Introduction
In this notebook we demonstrate how to use Milo to detect abherrant cell states in diseased tissues, using a dataset of hepatic non-parenchymal cells isolated from 5 healthy and 5 cirrhotic human livers. Ramachandran et al. 2019 (GEO accessiion: GSE136103).
suppressPackageStartupMessages({
library(tidyverse)
library(irlba)
library(DropletUtils)
library(scater)
library(scran)
library(Seurat) ## just 4 loading the object
library(miloR)
library(SingleCellExperiment)
library(patchwork)
library(igraph)
library(RColorBrewer)
library(cowplot)
})
Load data
We downloaded the dataset and annotations stored in Seurat object from here, as indicated by the authors.
load("/nfs/team205/ed6/data/Ramachandran2019_liver/tissue.rdata")
## Convert to SingleCellExperiment
liver_sce <- SingleCellExperiment(assay = list(counts=tissue@raw.data, logcounts=tissue@data),
colData = tissue@meta.data)
liver_sce
class: SingleCellExperiment
dim: 23498 58358
metadata(0):
assays(2): counts logcounts
rownames(23498): FO538757.2 AP006222.2 ... CTA-126B4.7 LINC01423
rowData names(0):
colnames(58358): Healthy1_Cd45+_AAACCTGCAGTATCTG
Healthy1_Cd45+_AACTGGTTCATGGTCA ...
Cirrhotic3_Cd45-_TTTGTCATCCAGGGCT
Cirrhotic3_Cd45-_TCTGGAAGTCATCCCT
colData names(10): nGene nUMI ... annotation_indepth
annotation_lineage
reducedDimNames(0):
spikeNames(0):
altExpNames(0):
Preprocessing
Feature selection
dec_liver <- modelGeneVar(liver_sce)
fit_liver <- metadata(dec_liver)
plot(fit_liver$mean, fit_liver$var, xlab="Mean of log-expression",
ylab="Variance of log-expression")

hvgs <- getTopHVGs(dec_liver, n=3000)
Dimensionality reduction with PCA
liver_sce <- runPCA(liver_sce, subset_row=hvgs, ncomponents=11)
plotPCA(liver_sce, colour_by="condition", ncomponents=3)

liver_sce <- runUMAP(liver_sce, dimred="PCA", ncomponents=2)
scater::plotUMAP(liver_sce, colour_by="condition", point_alpha=1, point_size=0.5)

scater::plotUMAP(liver_sce, colour_by="dataset", point_alpha=0.3, point_size=0.5)

scater::plotUMAP(liver_sce, colour_by="annotation_lineage", point_alpha=0.3, point_size=0.5, text_by='annotation_lineage')

# scater::plotUMAP(liver_sce, colour_by='annotation_indepth', point_alpha=0.3, point_size=0.5, text_by='annotation_indepth')
Notably, this dataset doesn’t appear to display a batch effect
DA analysis with Milo
We test for differential abundance between healthy and cirrhotic livers. We start by defining neighbourhoods with refined sampling on the KNN graph. We inspect the size of neighbourhoods.
liver_milo <- Milo(liver_sce)
## Build KNN graph
liver_milo <- buildGraph(liver_milo, d = 11, k=30)
Constructing kNN graph with k:30
## Compute neighbourhoods with refined sampling
liver_milo <- makeNhoods(liver_milo, k=30, d=11, prop = 0.05, refined=TRUE)
Checking valid object
plotNhoodSizeHist(liver_milo, bins=150)

Then we make a design matrix for the differential test, assigning samples to conditions.
liver_meta <- as.tibble(colData(liver_milo)[,c("dataset","condition")])
`as.tibble()` is deprecated as of tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
[90mThis warning is displayed once every 8 hours.[39m
[90mCall `lifecycle::last_warnings()` to see where this warning was generated.[39m
liver_meta <- distinct(liver_meta) %>%
mutate(condition=factor(condition, levels=c("Uninjured", "Cirrhotic"))) %>%
column_to_rownames("dataset")
Now we can count cells in neighbourhoods and perform the DA test.
liver_milo <- countCells(liver_milo, samples = "dataset", meta.data = data.frame(colData(liver_milo)[,c("dataset","condition")]) )
Checking meta.data validity
Setting up matrix with 2717 neighbourhoods
Counting cells in neighbourhoods
liver_milo <- calcNhoodDistance(liver_milo, d=11)
milo_res <- testNhoods(liver_milo, design = ~ condition, design.df = liver_meta, fdr.weighting = "k-distance")
Performing spatial FDR correction withk-distance weighting
Exploration of DA results
We can start by looking at some basic stats
pval_hist <- milo_res %>%
ggplot(aes(PValue)) +
geom_histogram(bins=50) +
theme_bw(base_size=14)
volcano <-
milo_res %>%
ggplot(aes(logFC, -log10(SpatialFDR))) +
geom_point(size=0.4, alpha=0.2) +
geom_hline(yintercept = -log10(0.1)) +
xlab("log-Fold Change") +
theme_bw(base_size=14)
pval_hist + volcano

The distribution of P-values looks sensible and from the volcano plot we can see that we have identified some DA neighbourhoods at 10% FDR. We can visualize DA neighbourhoods building an abstracted graph
liver_milo <- buildNhoodGraph(liver_milo)
Calculating nhood adjacency
\
Error: unexpected input in "\"
## Save milo object and results
# saveRDS(liver_milo,"/nfs/team205/ed6/data/Ramachandran2019_liver/tissue_milo.RDS")
saveRDS(liver_milo,"~/liver_milo_20201008.RDS")
# write_csv(milo_res, "/nfs/team205/ed6/data/Ramachandran2019_liver/milo_results.csv")
write_csv(milo_res,"/nfs/team205/ed6/data/Ramachandran2019_liver/liver_results_20201008.csv")
Parsed with column specification:
cols(
logFC = [32mcol_double()[39m,
logCPM = [32mcol_double()[39m,
F = [32mcol_double()[39m,
PValue = [32mcol_double()[39m,
FDR = [32mcol_double()[39m,
Nhood = [32mcol_double()[39m,
SpatialFDR = [32mcol_double()[39m
)
Making figures for the manuscript
colourCount = length(unique(liver_milo$annotation_lineage))
getPalette = colorRampPalette(brewer.pal(9, "Set2"))
n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
umap_df <- data.frame(reducedDim(liver_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")
umap1 <- cbind(umap_df, annotation_lineage=liver_milo$annotation_lineage) %>%
ggplot(aes(UMAP_1, UMAP_2, color=as.character(annotation_lineage))) +
geom_point(size=0.1, alpha=0.5) +
ggrepel::geom_text_repel(data = . %>%
group_by(annotation_lineage) %>%
summarise(UMAP_1=mean(UMAP_1), UMAP_2=mean(UMAP_2)),
aes(label=annotation_lineage), color="black", size=6
) +
scale_color_manual(values=getPalette(colourCount)) +
guides(color="none") +
xlab("UMAP1") + ylab("UMAP2") +
coord_fixed() +
theme_classic(base_size = 22) +
theme(axis.text = element_blank(), axis.ticks = element_blank())
umap2 <-
cbind(umap_df, condition=as.character(liver_milo$condition)) %>%
ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
geom_point(size=0.1, alpha=0.5) +
scale_color_brewer(palette="Set1", name='') +
xlab("UMAP1") + ylab("UMAP2") +
coord_fixed() +
guides(color='none') +
facet_wrap(condition~., ncol=1) +
theme_nothing(font_size = 22) +
theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
strip.background = element_rect(color=NA), strip.text = element_text(size=22))
nh_graph_pl <- plotNhoodGraphDA(liver_milo, milo_res, alpha = 0.1) +
theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) +
coord_fixed()
invalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generated
fig4_top <- (umap1 | umap2 | nh_graph_pl) +
plot_layout(widths = c(3,1,3))
fig4_top +
ggsave("~/milo_output/liver_embedding.pdf", width=15, height = 10)

Next, we can check the cell types where we observe most differences between healthy and cirrhotic cells, by taking the most frequent cell type in each neighbourhood.
# Add annotation of most frequent cell type per nhood to milo results table
milo_res <- annotateNhoods(liver_milo, milo_res, "annotation_indepth")
anno_df <- data.frame(liver_milo@colData) %>%
distinct(annotation_lineage, annotation_indepth)
milo_res <- left_join(milo_res, anno_df, by="annotation_indepth")
We first check that neighbourhoods are quite homogeneous
frac_hist <- ggplot(milo_res, aes(annotation_indepth_fraction)) +
geom_histogram(bins=30) +
xlab("Fraction of cells in \nmost abundant cluster") +
ylab("# neighbourhoods") +
theme_bw(base_size=14)
frac_hist

I can recover all the clusters where DA was detected in the original paper (see all the barplots for each lineage) and more! All in a single analysis, and without knowing where the subclusters are. Let’s bear in mind that positive logFC
group.by = "annotation_indepth"
paper_DA <- list(cirrhotic=c("MPs (4)","MPs (5)",
"Endothelia (6)", "Endothelia (7)",
"Mes (3)",
"Tcells (2)",
"Myofibroblasts"
),
healthy=c("MPs (7)",
"Endothelia (1)",
"Tcells (1)", "Tcells (3)","Tcells (1)",
"ILCs (1)"
)
)
expDA_df <- bind_rows(
data.frame(annotation_indepth = paper_DA[["cirrhotic"]], pred_DA="cirrhotic"),
data.frame(annotation_indepth = paper_DA[["healthy"]], pred_DA="healthy")
)
pl1 <- milo_res %>%
left_join(expDA_df) %>%
mutate(is_signif = ifelse(SpatialFDR < 0.1, 1, 0)) %>%
mutate(logFC_color = ifelse(is_signif==1, logFC, NA)) %>%
arrange(annotation_lineage) %>%
mutate(Nhood=factor(Nhood, levels=unique(Nhood))) %>%
ggplot(aes(annotation_indepth, logFC, color=logFC_color)) +
scale_color_gradient2() +
guides(color="none") +
xlab(group.by) + ylab("Log Fold Change") +
ggbeeswarm::geom_quasirandom(alpha=1) +
coord_flip() +
facet_grid(annotation_lineage~., scales="free", space="free") +
theme_bw(base_size=22) +
theme(strip.text.y = element_text(angle=0),
axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(),
)
Joining, by = "annotation_indepth"
pl2 <- milo_res %>%
left_join(expDA_df) %>%
# dplyr::filter(!is.na(pred_DA)) %>%
group_by(annotation_indepth) %>%
summarise(pred_DA=dplyr::first(pred_DA), annotation_lineage=dplyr::first(annotation_lineage)) %>%
mutate(end=ifelse(pred_DA=="healthy", 0, 1),
start=ifelse(pred_DA=="healthy", 1, 0)) %>%
ggplot(aes(annotation_indepth, start, xend = annotation_indepth, yend = end, color=pred_DA)) +
geom_segment(size=1,arrow=arrow(length = unit(0.1, "npc"), type="closed")) +
coord_flip() +
xlab("annotation") +
facet_grid(annotation_lineage~.,
# annotation_lineage~"Ramachandran et al.\nDA predictions",
scales="free", space="free") +
# guides(color="none") +
scale_color_brewer(palette="Set1", direction = -1,
labels=c("enriched in cirrhotic", "enriched in healthy"),
na.translate = F,
name="Ramachandran et al.\nDA predictions") +
guides(color=guide_legend(ncol = 1)) +
theme_bw(base_size=22) +
ylim(-0.1,1.1) +
theme(strip.text.y = element_blank(),strip.text.x = element_text(angle=90),
plot.margin = unit(c(0,0,0,0), "cm"), panel.grid = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(),
legend.position = "bottom")
Joining, by = "annotation_indepth"
`summarise()` ungrouping output (override with `.groups` argument)
fig4_bleft <- (pl2 + pl1 +
plot_layout(widths=c(1,10), guides = "collect") & theme(legend.position = 'top', legend.justification = 0))
fig4_bleft +
ggsave("~/milo_output/liver_DAcomparison.pdf", width=8, height = 13)

Close-up on Endothelial lineage
endo_milo <- scater::runUMAP(liver_milo[,liver_milo$annotation_lineage=="Endothelia"], dimred='PCA')
plotUMAP(endo_milo, colour_by = "annotation_indepth")

Filter out cells that show contamination from immune cells (expression of immune markers)
keep <- logcounts(endo_milo)["CD19",] == 0 | logcounts(endo_milo)["MS4A1",] == 0
endo_milo <- endo_milo[,keep]
endo_milo <- scater::runUMAP(endo_milo, dimred='PCA')
plotUMAP(endo_milo, colour_by = "annotation_indepth")

umap_df <- data.frame(reducedDim(endo_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")
endo_umap <- cbind(umap_df, condition=endo_milo$condition) %>%
ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
geom_point(size=0.3, alpha=0.5) +
scale_color_brewer(palette="Set1", name='') +
xlab("UMAP1") + ylab("UMAP2") +
coord_fixed() +
guides(color="none") +
facet_wrap(condition~., ncol=1) +
theme_nothing() +
theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
strip.background = element_rect(color=NA), strip.text = element_text(size=22))
liver_milo2 <- liver_milo
subset.nhoods <- str_detect(milo_res$annotation_indepth, "Endo")
reducedDim(liver_milo2, "UMAP")[colnames(endo_milo),] <- reducedDim(endo_milo, "UMAP")
endo_gr <-
plotNhoodGraphDA(
liver_milo2, milo_res,
subset.nhoods = subset.nhoods,
# ) =)[1:(length()-1)],
alpha = 0.1
) +
theme(legend.text = element_text(size=22), legend.title = element_text(size=24))
fig4_bright1 <- ((endo_umap + endo_gr ) +
plot_layout(widths = c(1,2),
guides = "collect"
))
fig4_bright1 +
ggsave("~/milo_output/liver_endoGraph.pdf", width=9, height = 5)

Differential expression between DA neighbourhoods
We merge overlapping nhoods with significant DA and the same direction of logFC, and then test for differential expression between cells in disease-specific and healthy-specific nhoods. This allows us to perform a comparison without further clustering.
Here we find markers grouping gene expression counts by sample (i.e. we don’t treat cells as replicates, but exploit the replication structure used also for DA testing)

x
class: Milo
dim: 23498 5883
metadata(0):
assays(2): counts logcounts
rownames(23498): FO538757.2 AP006222.2 ... CTA-126B4.7 LINC01423
rowData names(0):
colnames(5883): Healthy1_Cd45+_CCTCAGTTCCAAGTAC Healthy1_Cd45-A_TTCTCCTAGGGATGGG ... Cirrhotic3_Cd45-_TGTTCCGTCCAAACAC
Cirrhotic3_Cd45-_TCTGGAAGTCATCCCT
colData names(10): nGene nUMI ... annotation_indepth annotation_lineage
reducedDimNames(2): PCA UMAP
spikeNames(0):
altExpNames(0):
nhoods dimensions(1): 2717
nhoodCounts dimensions(2): 2717 20
nhoodDistances dimension(1): 2717
graph names(1): graph
nhoodIndex names(1): 2717
nhoodExpression dimension(2): 1 1
nhoodReducedDim names(0):
nhoodGraph names(1): nhoodGraph
nhoodAdjacency dimension(0):
Significant DGE genes (FDR 10%)
marker.df[marker.df$adj.P.Val_1 < 0.1,]
Visualize as volcano

Visualize as heatmap
(gene expression values are scaled between 0 and 1 for each gene)

GO term analysis
[1] "FCN3" "MS4A6A" "CRHBP" "CLEC4G" "FCN2" "CLEC1B" "CLEC4M" "ACP5" "LYVE1" "OIT3" "TFPI2" "MEG3" "STAB2" "FCGR2B"
[15] "BEX1" "BASP1" "ANPEP" "RELN" "LYPD2" "ECM1" "NPL" "MRO" "ACSM3" "SLC7A8" "PVALB" "EHD3"
em_res_up %>%
top_n(20, -log10(qvalue)) %>%
mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
ggplot(aes(Term, -log10(qvalue))) +
geom_point() +
coord_flip() +
xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
theme_bw(base_size=14) +
ggtitle("Markers of cirrhotic")

em_res_down %>%
top_n(30, -log10(qvalue)) %>%
mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
ggplot(aes(Term, -log10(qvalue))) +
geom_point() +
coord_flip() +
xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
theme_bw(base_size=14) +
ggtitle("Markers of healthy")

Full GO enrichment - up in cirrhotic
(geneID column indicates which genes belong to gene ontology)
em_res_up
Full GO enrichment - up in healty
(geneID column indicates which genes belong to gene ontology)
em_res_down
Close-up on Cholangiocytes
Filter out cells that show contamination from immune cells (expression of immune markers)

umap_df <- data.frame(reducedDim(chol_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")
chol_umap <- cbind(umap_df, condition=chol_milo$condition) %>%
ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
geom_point(size=0.3, alpha=0.5) +
scale_color_brewer(palette="Set1", name='') +
xlab("UMAP1") + ylab("UMAP2") +
coord_fixed() +
guides(color="none") +
facet_wrap(condition~., ncol=1) +
theme_nothing() +
theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
strip.background = element_rect(color=NA), strip.text = element_text(size=22))
chol_umap

liver_milo2 <- liver_milo
subset.nhoods <- str_detect(milo_res$annotation_indepth, "Cholangio")
reducedDim(liver_milo2, "UMAP")[colnames(chol_milo),] <- reducedDim(chol_milo, "UMAP")
chol_gr <-
plotNhoodGraphDA(
liver_milo2, milo_res,
subset.nhoods = subset.nhoods,
# ) =)[1:(length()-1)],
alpha = 0.1
) +
theme(legend.text = element_text(size=22), legend.title = element_text(size=24))
(chol_umap + chol_gr ) +
plot_layout(widths = c(1,2),
guides = "collect"
)

# fig4_bright1 +
# ggsave("~/milo_output/liver_endoGraph.pdf", width=9, height = 5)
Differential expression between DA neighbourhoods
We merge overlapping nhoods with significant DA and the same direction of logFC, and then test for differential expression between cells in disease-specific and healthy-specific nhoods. This allows us to perform a comparison without further clustering.
Here we find markers grouping gene expression counts by samples (i.e. we don’t treat cells as replicates, but exploit the replication structure used also for DA testing)
nhoodAdjacency found - using for nhood grouping
Nhoods aggregated into 2 groups
Table of significant DGE genes (FDR 10%)
marker.df[marker.df$adj.P.Val_1 < 0.1,]
Visualize as volcano
marker.df %>%
mutate(adj.P.Val_1 = ifelse(- log10(adj.P.Val_1) > 300, 1e-300, adj.P.Val_1)) %>%
mutate(label=ifelse(-log10(adj.P.Val_1) > 1.7, GeneID, NA)) %>%
ggplot(aes(logFC_2, -log10(adj.P.Val_1),
# color=highlight
)) +
geom_point() +
ggrepel::geom_text_repel(aes(label=label)) +
xlab("logFC") + ylab("- log10(Adj. P value)") +
theme_bw(base_size = 22)

NA
NA
Visualize as heatmap
(gene expression values are scaled between 0 and 1 for each gene)
marker_genes_up <- marker.df %>%
dplyr::filter(adj.P.Val_1 < 0.01 & logFC_2 > 2) %>%
pull(GeneID)
plotNhoodExpressionDA(liver_milo, milo_res, marker_genes_up, cluster_features = TRUE,
alpha = 0.1,
scale_to_1 = TRUE,
subset.nhoods = milo_res$annotation_lineage=="Cholangiocytes",
show_rownames = TRUE
) +
ylab("DE genes") +
theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) +
plot_layout(heights = c(1,10))

NA
NA
GO term analysis
BiocManager::install('clusterProfiler')
Bioconductor version 3.10 (BiocManager 1.30.10), R 3.6.3 (2020-02-29)
Installing package(s) 'clusterProfiler'
trying URL 'https://bioconductor.org/packages/3.10/bioc/src/contrib/clusterProfiler_3.14.3.tar.gz'
Content type 'application/x-gzip' length 2634460 bytes (2.5 MB)
==================================================
downloaded 2.5 MB
* installing *source* package ‘clusterProfiler’ ...
** using staged installation
** R
** data
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location
Execution halted
* removing ‘/home/jovyan/R/x86_64-pc-linux-gnu-library/3.6/clusterProfiler’
* restoring previous ‘/home/jovyan/R/x86_64-pc-linux-gnu-library/3.6/clusterProfiler’
em_res_up %>%
top_n(40, -log10(qvalue)) %>%
mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
ggplot(aes(Term, -log10(qvalue))) +
geom_point() +
coord_flip() +
xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
theme_bw(base_size=14) +
ggtitle("Markers of healthy")

Full GO enrichment - up in cirrhotic
(geneID column indicates which genes belong to gene ontology)
em_res_up
Assemble figure
fig4_bottom <- ((fig4_bleft + plot_layout()) |
((fig4_bright1 + plot_layout(tag_level = 'keep')) / (fig4_bbright + plot_layout())) +
plot_layout(heights = c(1,1.6))
) +
plot_layout(widths=c(1,1.2))
(fig4_top / fig4_bottom) +
plot_layout(heights=c(1,1.8)) +
ggsave("~/milo_output/fig4.pdf", height = 26, width = 22, useDingbats=FALSE)
# ggsave("~/milo/ms/figures/figs/figure5.pdf", height = 26, width = 22, useDingbats=FALSE)
Assemble supplementary figure
p1 <- plot_grid(pval_hist,
volcano + ylab("- log10(Spatial FDR)"),
frac_hist, nrow=1,
labels = c("A", "B", "C"))
cowplot::plot_grid(p1, go_plot, rel_heights = c(1,1.5), ncol=1, labels = c("", "D")) +
ggsave("~/milo/ms/supplement/suppl_figs/suppl_fig6.pdf", height = 7, width=8)
ggsave("~/milo_output/liver_supplementary.png", height = 7, width=8)
---
title: "Milo: liver cirrhosis analysis"
output: 
  html_notebook:
    code_folding: hide
---

## Introduction

In this notebook we demonstrate how to use Milo to detect abherrant cell states in diseased tissues, using a dataset of hepatic non-parenchymal cells isolated from 5 healthy and 5 cirrhotic human livers. [Ramachandran et al. 2019](https://www.nature.com/articles/s41586-019-1631-3#Sec1) (GEO accessiion: GSE136103).

```{r}
suppressPackageStartupMessages({
  library(tidyverse)
  library(irlba)
  library(DropletUtils)
  library(scater)
  library(scran)
  library(Seurat) ## just 4 loading the object
  library(miloR)
  library(SingleCellExperiment)
  library(patchwork)
  library(igraph)
  library(RColorBrewer)
  library(cowplot)
  })
```

## Load data

We downloaded the dataset and annotations stored in Seurat object from [here](https://datashare.is.ed.ac.uk/handle/10283/3433), as indicated by the authors.

```{r}
load("/nfs/team205/ed6/data/Ramachandran2019_liver/tissue.rdata")

## Convert to SingleCellExperiment
liver_sce <- SingleCellExperiment(assay = list(counts=tissue@raw.data, logcounts=tissue@data),
                                  colData = tissue@meta.data)

liver_sce
```

## Preprocessing

### Feature selection

```{r}
dec_liver <- modelGeneVar(liver_sce)

fit_liver <- metadata(dec_liver)
plot(fit_liver$mean, fit_liver$var, xlab="Mean of log-expression",
    ylab="Variance of log-expression")

hvgs <- getTopHVGs(dec_liver, n=3000)
```

### Dimensionality reduction with PCA

```{r, fig.height=14, fig.width=14}
liver_sce <- runPCA(liver_sce, subset_row=hvgs, ncomponents=11)

plotPCA(liver_sce, colour_by="condition", ncomponents=3)
```

```{r, fig.height=8, fig.width=8}
liver_sce <- runUMAP(liver_sce, dimred="PCA", ncomponents=2)

scater::plotUMAP(liver_sce, colour_by="condition", point_alpha=1,  point_size=0.5)
scater::plotUMAP(liver_sce, colour_by="dataset", point_alpha=0.3,  point_size=0.5)
scater::plotUMAP(liver_sce, colour_by="annotation_lineage", point_alpha=0.3,  point_size=0.5, text_by='annotation_lineage')
# scater::plotUMAP(liver_sce, colour_by='annotation_indepth', point_alpha=0.3,  point_size=0.5, text_by='annotation_indepth')
```

Notably, this dataset doesn't appear to display a batch effect

## DA analysis with Milo

We test for differential abundance between healthy and cirrhotic livers. We start by defining neighbourhoods with refined sampling on the KNN graph. We inspect the size of neighbourhoods.

```{r}
liver_milo <- Milo(liver_sce)

## Build KNN graph
liver_milo <- buildGraph(liver_milo, d = 11, k=30)

## Compute neighbourhoods with refined sampling
liver_milo <- makeNhoods(liver_milo, k=30, d=11, prop = 0.05, refined=TRUE)
plotNhoodSizeHist(liver_milo, bins=150)
```

Then we make a design matrix for the differential test, assigning samples to conditions.

```{r}
liver_meta <- as.tibble(colData(liver_milo)[,c("dataset","condition")])
liver_meta <- distinct(liver_meta) %>%
  mutate(condition=factor(condition, levels=c("Uninjured", "Cirrhotic"))) %>%
  column_to_rownames("dataset")
```

Now we can count cells in neighbourhoods and perform the DA test.

```{r}
liver_milo <- countCells(liver_milo, samples = "dataset", meta.data = data.frame(colData(liver_milo)[,c("dataset","condition")]) )

liver_milo <- calcNhoodDistance(liver_milo, d=11)
milo_res <- testNhoods(liver_milo, design = ~ condition, design.df = liver_meta, fdr.weighting = "k-distance")
```

## Exploration of DA results

We can start by looking at some basic stats

```{r}
pval_hist <- milo_res %>%
  ggplot(aes(PValue)) +
  geom_histogram(bins=50) +
  theme_bw(base_size=14)

volcano <-
  milo_res %>%
  ggplot(aes(logFC, -log10(SpatialFDR))) +
  geom_point(size=0.4, alpha=0.2) +
  geom_hline(yintercept = -log10(0.1)) +
  xlab("log-Fold Change") +
  theme_bw(base_size=14)

pval_hist + volcano
```

The distribution of P-values looks sensible and from the volcano plot we can see that we have identified some DA neighbourhoods at 10% FDR.
We can visualize DA neighbourhoods building an abstracted graph

```{r, fig.width=14, fig.height=10}
liver_milo <- buildNhoodGraph(liver_milo)
plotNhoodGraphDA(liver_milo, milo_res, alpha = 0.05)
```

```{r}
## Save milo object and results
# saveRDS(liver_milo,"/nfs/team205/ed6/data/Ramachandran2019_liver/tissue_milo.RDS")
saveRDS(liver_milo,"~/liver_milo_20201008.RDS")
# write_csv(milo_res, "/nfs/team205/ed6/data/Ramachandran2019_liver/milo_results.csv")
write_csv(milo_res,"/nfs/team205/ed6/data/Ramachandran2019_liver/liver_results_20201008.csv")
```
```{r, echo=FALSE}
liver_milo <- readRDS("~/liver_milo_20201008.RDS")
# milo_res <- read_csv("~/liver_results_20201008.csv")
milo_res <- read_csv("/nfs/team205/ed6/data/Ramachandran2019_liver/liver_results_20201008.csv")
```

Making figures for the manuscript

```{r, fig.width=15, fig.height=10}
colourCount = length(unique(liver_milo$annotation_lineage))
getPalette = colorRampPalette(brewer.pal(9, "Set2"))

umap_df <- data.frame(reducedDim(liver_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")

umap1 <- cbind(umap_df, annotation_lineage=liver_milo$annotation_lineage) %>%
  ggplot(aes(UMAP_1, UMAP_2, color=as.character(annotation_lineage))) +
  geom_point(size=0.1, alpha=0.5) +
  ggrepel::geom_text_repel(data = . %>%
              group_by(annotation_lineage) %>%
              summarise(UMAP_1=mean(UMAP_1), UMAP_2=mean(UMAP_2)),
            aes(label=annotation_lineage), color="black", size=6
            ) +
  scale_color_manual(values=getPalette(colourCount)) +
  guides(color="none") +
  xlab("UMAP1") + ylab("UMAP2") +
  coord_fixed() +
  theme_classic(base_size = 22) +
  theme(axis.text = element_blank(), axis.ticks = element_blank())

umap2 <-
  cbind(umap_df, condition=as.character(liver_milo$condition)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
  geom_point(size=0.1, alpha=0.5) +
  scale_color_brewer(palette="Set1", name='') +
  xlab("UMAP1") + ylab("UMAP2") +
  coord_fixed() +
  guides(color='none') +
  facet_wrap(condition~., ncol=1) +
  theme_nothing(font_size = 22) +
  theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
        strip.background = element_rect(color=NA), strip.text = element_text(size=22))

nh_graph_pl <- plotNhoodGraphDA(liver_milo, milo_res, alpha = 0.1) +
  theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) +
  coord_fixed()

fig4_top <- (umap1 | umap2 | nh_graph_pl) +
  plot_layout(widths = c(3,1,3))

fig4_top +
  ggsave("~/milo_output/liver_embedding.pdf", width=15, height = 10)
```

Next, we can check the cell types where we observe most differences between healthy and cirrhotic cells, by taking the most frequent cell type in each neighbourhood.

```{r, fig.width=9, fig.height=10}
# Add annotation of most frequent cell type per nhood to milo results table
milo_res <- annotateNhoods(liver_milo, milo_res, "annotation_indepth")
anno_df <- data.frame(liver_milo@colData) %>%
  distinct(annotation_lineage, annotation_indepth)
milo_res <- left_join(milo_res, anno_df, by="annotation_indepth")
```

We first check that neighbourhoods are quite homogeneous

```{r}
frac_hist <- ggplot(milo_res, aes(annotation_indepth_fraction)) +
  geom_histogram(bins=30) +
  xlab("Fraction of cells in \nmost abundant cluster") +
  ylab("# neighbourhoods") +
  theme_bw(base_size=14)

frac_hist
```

I can recover all the clusters where DA was detected in the original paper (see all the barplots for each lineage) and more! All in a single analysis, and without knowing where the subclusters are. Let's bear in mind that positive logFC

```{r, fig.width=10, fig.height=10}
group.by = "annotation_indepth"
paper_DA <- list(cirrhotic=c("MPs (4)","MPs (5)",
                             "Endothelia (6)", "Endothelia (7)",
                             "Mes (3)",
                             "Tcells (2)",
                             "Myofibroblasts"
                             ),
                 healthy=c("MPs (7)",
                           "Endothelia (1)",
                           "Tcells (1)", "Tcells (3)","Tcells (1)",
                           "ILCs (1)"
                           )
                 )

expDA_df <- bind_rows(
  data.frame(annotation_indepth = paper_DA[["cirrhotic"]], pred_DA="cirrhotic"),
  data.frame(annotation_indepth = paper_DA[["healthy"]], pred_DA="healthy")
  )

pl1 <- milo_res %>%
  left_join(expDA_df) %>%
  mutate(is_signif = ifelse(SpatialFDR < 0.1, 1, 0)) %>%
  mutate(logFC_color = ifelse(is_signif==1, logFC, NA)) %>%
  arrange(annotation_lineage) %>%
  mutate(Nhood=factor(Nhood, levels=unique(Nhood))) %>%
  ggplot(aes(annotation_indepth, logFC, color=logFC_color)) +
  scale_color_gradient2() +
  guides(color="none") +
  xlab(group.by) + ylab("Log Fold Change") +
  ggbeeswarm::geom_quasirandom(alpha=1) +
  coord_flip() +
  facet_grid(annotation_lineage~., scales="free", space="free") +
  theme_bw(base_size=22) +
  theme(strip.text.y =  element_text(angle=0),
        axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(),
        )

pl2 <- milo_res %>%
  left_join(expDA_df) %>%
  # dplyr::filter(!is.na(pred_DA)) %>%
  group_by(annotation_indepth) %>%
  summarise(pred_DA=dplyr::first(pred_DA), annotation_lineage=dplyr::first(annotation_lineage)) %>%
  mutate(end=ifelse(pred_DA=="healthy", 0, 1),
         start=ifelse(pred_DA=="healthy", 1, 0)) %>%
  ggplot(aes(annotation_indepth, start, xend = annotation_indepth, yend = end, color=pred_DA)) +
  geom_segment(size=1,arrow=arrow(length = unit(0.1, "npc"), type="closed")) +
  coord_flip() +
  xlab("annotation") +
  facet_grid(annotation_lineage~.,
    # annotation_lineage~"Ramachandran et al.\nDA predictions",
             scales="free", space="free") +
  # guides(color="none") +
  scale_color_brewer(palette="Set1", direction = -1,
                     labels=c("enriched in cirrhotic", "enriched in healthy"),
                     na.translate = F,
                     name="Ramachandran et al.\nDA predictions") +
  guides(color=guide_legend(ncol = 1)) +
  theme_bw(base_size=22) +
  ylim(-0.1,1.1) +
  theme(strip.text.y = element_blank(),strip.text.x = element_text(angle=90),
        plot.margin = unit(c(0,0,0,0), "cm"), panel.grid = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(),
        legend.position = "bottom")

fig4_bleft <- (pl2 + pl1 +
  plot_layout(widths=c(1,10), guides = "collect") & theme(legend.position = 'top', legend.justification = 0))

fig4_bleft +
  ggsave("~/milo_output/liver_DAcomparison.pdf", width=8, height = 13)
```

## Close-up on Endothelial lineage

```{r}
endo_milo <- scater::runUMAP(liver_milo[,liver_milo$annotation_lineage=="Endothelia"],  dimred='PCA')
plotUMAP(endo_milo, colour_by = "annotation_indepth")
```

Filter out cells that show contamination from immune cells (expression of immune markers)

```{r}
keep <- logcounts(endo_milo)["CD19",] == 0 | logcounts(endo_milo)["MS4A1",] == 0
endo_milo <- endo_milo[,keep]
endo_milo <- scater::runUMAP(endo_milo,  dimred='PCA')

plotUMAP(endo_milo, colour_by = "annotation_indepth")
```

```{r}
umap_df <- data.frame(reducedDim(endo_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")

endo_umap <- cbind(umap_df, condition=endo_milo$condition) %>%
   ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
  geom_point(size=0.3, alpha=0.5) +
  scale_color_brewer(palette="Set1", name='') +
  xlab("UMAP1") + ylab("UMAP2") +
  coord_fixed() +
  guides(color="none") +
  facet_wrap(condition~., ncol=1) +
  theme_nothing() +
  theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
        strip.background = element_rect(color=NA), strip.text = element_text(size=22))
```

```{r, fig.width=8, fig.height=4, message=FALSE, warning=FALSE}
liver_milo2 <- liver_milo
subset.nhoods <- str_detect(milo_res$annotation_indepth, "Endo")
reducedDim(liver_milo2, "UMAP")[colnames(endo_milo),] <- reducedDim(endo_milo, "UMAP") 


endo_gr <-
  plotNhoodGraphDA(
  liver_milo2, milo_res,
  subset.nhoods = subset.nhoods, 
  # ) =)[1:(length()-1)], 
  alpha = 0.1
  )  +
   theme(legend.text = element_text(size=22), legend.title = element_text(size=24))
  
fig4_bright1 <- ((endo_umap + endo_gr ) + 
  plot_layout(widths = c(1,2), 
                guides = "collect"
                )) 
fig4_bright1 +
  ggsave("~/milo_output/liver_endoGraph.pdf", width=9, height = 5)  
```


### Differential expression between DA neighbourhoods

We merge overlapping nhoods with significant DA and the same direction of logFC, and then test for differential expression between cells in disease-specific and healthy-specific nhoods. This allows us to perform a comparison without further clustering.

Here we find markers grouping gene expression counts by sample (i.e. we don't treat cells as replicates, but exploit the replication structure used also for DA testing)
```{r}
library(limma)
library(edgeR)

nhs <- nhoods(x)
nhood.adj <- nhoodAdjacency(x)
da.res <- milo_res

is.da
#' @importFrom igraph graph_from_adjacency_matrix components
.group_nhoods_from_adjacency <- function(nhs, nhood.adj, da.res, is.da,
                                         merge.discord=FALSE,
                                         lfc.threshold=NULL,
                                         overlap=1, subset.nhoods=NULL){

    if(is.null(names(nhs))){
        warning("No names attributed to nhoods. Converting indices to names")
        names(nhs) <- as.character(c(1:length(nhs)))
    }

    # assume order of nhs is the same as nhood.adj
    if(!is.null(subset.nhoods)){
        if(mode(subset.nhoods) %in% c("character", "logical", "numeric")){
            # force use of logicals for consistency
            if(mode(subset.nhoods) %in% c("character", "numeric")){
                sub.log <- names(nhs) %in% subset.nhoods
            } else{
                sub.log <- subset.nhoods
            }

            nhood.adj <- nhood.adj[sub.log, sub.log]

            if(length(is.da) == length(nhs)){
                nhs <- nhs[sub.log]
                is.da <- is.da[sub.log]
                da.res <- da.res[sub.log, ]
            } else{
                stop("Subsetting `is.da` vector length does not equal nhoods length")
            }
        } else{
            stop(paste0("Incorrect subsetting vector provided:", class(subset.nhoods)))
        }
    } else{
        if(length(is.da) != ncol(nhood.adj)){
            stop("Subsetting `is.da` vector length is not the same dimension as adjacency")
        }
    }

    ## check for concordant signs - assume order is the same as nhoods
    if(isFALSE(merge.discord)){
        nonz.nhs <- colSums(nhood.adj) > 0
        ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs])))

        concord.sign <- sapply(1:nrow(ll_names), function(x) sign(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) ==
                                   sign(da.res[nonz.nhs,][as.numeric(ll_names[x, 2]), ]$logFC))
        pairs_int <- sapply(which(concord.sign), function(x) length(intersect(nhs[nonz.nhs][[ll_names[x, 1]]], nhs[nonz.nhs][[ll_names[x, 2]]])))
        lintersect <- rep(0, nrow(ll_names))
        lintersect[concord.sign] <- pairs_int

        ## Count as connected only nhoods with at least n shared cells
        lintersect_filt <- ifelse(!concord.sign, 0, lintersect)
        ll_names <- cbind(ll_names, lintersect_filt)

        nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, 3]
    }

    if(overlap > 1){
        # loop over adj dimensions and mask out cells with insufficient overlapping cells
        nonz.nhs <- colSums(nhood.adj) > 0
        ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs])))

        keep_pairs <- sapply(1:nrow(ll_names) , function(x) any(nhs[nonz.nhs][[ll_names[x, 1]]] %in% nhs[nonz.nhs][[ll_names[x, 2]]]))
        pairs_int <- sapply(which(keep_pairs), function(x) length(intersect(nhs[nonz.nhs][[ll_names[x, 1]]], nhs[nonz.nhs][[ll_names[x, 2]]])))
        lintersect <- rep(0, nrow(ll_names))
        lintersect[keep_pairs] <- pairs_int

        ## Count as connected only nhoods with at least n shared cells
        lintersect_filt <- ifelse(lintersect < overlap, 0, lintersect)
        ll_names <- cbind(ll_names, lintersect_filt)

        nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, 3]
    }

    if(!is.null(lfc.threshold)){
        nonz.nhs <- colSums(nhood.adj) > 0
        ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs])))

        # set adjacency to 0 for nhoods with lfc < threshold
        lfc.pass <- sapply(1:nrow(ll_names), function(x) (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) >= lfc.threshold) &
                               (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 2]), ]$logFC) >= lfc.threshold))
        pairs_int <- sapply(which(lfc.pass), function(x) length(intersect(nhs[nonz.nhs][[ll_names[x, 1]]], nhs[nonz.nhs][[ll_names[x, 2]]])))
        lintersect[lfc.pass] <- pairs_int

        ## Count as connected only nhoods with at least n shared cells
        lintersect_filt <- ifelse(!lfc.pass, 0, lintersect)
        ll_names <- cbind(ll_names, lintersect_filt)

        nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, 3]
    }

    # binarise
    nhood.adj <- as.matrix((nhood.adj > 0) + 0)

    n.dim <- ncol(nhood.adj)
    if(!isSymmetric(nhood.adj)){
        stop("Overlap matrix is not symmetric")
    }

    if(nrow(nhood.adj) != ncol(nhood.adj)){
        stop("Non-square distance matrix - check nhood subsetting")
    }

    g <- graph_from_adjacency_matrix(nhood.adj, mode="undirected", diag=FALSE)
    groups <- components(g)$membership

    # only keep the groups that contain >= 1 DA neighbourhoods
    keep.groups <- intersect(unique(groups[is.da]), unique(groups))

    return(groups[groups %in% keep.groups])
}


 .perform_counts_dge <- function(exprs.data, test.model, gene.offset=gene.offset,
                                 model.contrasts=NULL, n.coef=NULL){

     i.dge <- DGEList(counts=exprs.data,
                      lib.size=log(colSums(exprs.data)))

     if(isTRUE(gene.offset)){
         n.gene <- apply(exprs.data, 2, function(X) sum(X > 0))
         test.model <- cbind(test.model[, 1], n.gene, test.model[, c(2:ncol(test.model))])
         colnames(test.model) <- c(colnames(test.model)[1], "NGenes", colnames(test.model[, c(2:ncol(test.model))]))
     }

     i.dge <- estimateDisp(i.dge, test.model)
     i.fit <- glmQLFit(i.dge, test.model, robust=TRUE)

     if(!is.null(model.contrasts)){
         mod.constrast <- makeContrasts(contrasts=model.contrasts, levels=test.model)
         i.res <- as.data.frame(topTags(glmQLFTest(i.fit, contrast=mod.constrast),
                                        sort.by='none', n=Inf))
     } else{
         if(is.null(n.coef)){
             n.coef <- ncol(test.model)
         }
         i.res <- as.data.frame(topTags(glmQLFTest(i.fit, coef=n.coef), sort.by='none', n=Inf))
     }
     return(i.res)
 }

cell2nhoods <- function(x){
  nhs <- lapply(nhoods(x), function(nh) as.vector(nh))
  # nhood_mat <- matrix(nrow = ncol(x), ncol=length(nhoods(x)))
  nhood_mat <- sapply(nhs, function(nh) ifelse(1:ncol(x) %in% nh, 1, 0))
  return(nhood_mat)
}

x = liver_milo
da.res = milo_res
da.fdr=0.05
assay="counts"
overlap=1
lfc.threshold = 1
merge.discord=FALSE
subset.row = hvgs
gene.offset=FALSE
return.groups=FALSE
subset.nhoods = str_detect(milo_res$annotation_indepth, "Endo")
na.function="na.pass"
compute.new=FALSE
bits=FALSE

c2n <- cell2nhoods(x)

# nhood.adj1 <- t(c2n) %*% c2n 
# nhood.adj1 <- as(nhood.adj1, "dgCMatrix")
# 
# # nhood.adj.old <- nhoodAdjacency(liver_milo)
# nhoodAdjacency(liver_milo) <- as.matrix(nhood.adj1)
# 
# pheatmap::pheatmap(nhood.adj.old[rownames(nhood.adj1), rownames(nhood.adj1)] - nhood.adj1)

# ## Make groups
# c2n <- c2n[,subset.nhoods & is.da]
# da.res <- da.res[subset.nhoods & is.da,]
neg_logfc <- da.res$logFC < - lfc.threshold
pos_logfc <- da.res$logFC > lfc.threshold

gr1 <- rowSums(c2n[,subset.nhoods & is.da & neg_logfc]) > 0
gr2 <- rowSums(c2n[,subset.nhoods & is.da & pos_logfc]) > 0

# length((subset.nhoods) & (is.da) & (neg_logfc))
# g <- graph_from_adjacency_matrix(nhood.adj, mode="undirected", diag=FALSE)
#     groups <- components(g)$membership


fake.meta <- data.frame("CellID"=colnames(x), "Nhood.Group"=rep(NA, ncol(x)))
rownames(fake.meta) <- fake.meta$CellID
fake.meta["Nhood.Group"] <- ifelse(gr1 & !gr2, 1, ifelse(gr2 & !gr1, 2, NA))

umap_df['gr'] <- fake.meta[rownames(umap_df),"Nhood.Group"]
ggplot(umap_df, aes(UMAP_1, UMAP_2)) + geom_point(aes(color=gr))
```


```{r}
# for(i in seq_along(nhood.gr)){
#     nhood.x <- nhs.da.gr == nhood.gr[i]
#     # get the nhoods
#     nhs <- nhoods(x)
#     if(!is.null(subset.nhoods)){
#         nhs <- nhs[subset.nhoods]
#     }
# 
#     if(!any(is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group))){
#         fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group[!is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group)] <- NA
#         } else{
#             fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group <- nhood.gr[i]
#         }
# }
# only compare against the other DA neighbourhoods
x <- x[, !is.na(fake.meta$Nhood.Group)]
fake.meta <- fake.meta[!is.na(fake.meta$Nhood.Group), ]

if(!is.null(subset.row)){
    x <- x[subset.row, , drop=FALSE]
}

fake.meta[,"sample_id"] <- sapply(strsplit(fake.meta$CellID, "_"), function(x) paste(x[1], x[2], sep="_"))
fake.meta[,'sample_group'] <- paste(fake.meta[,"sample_id"], fake.meta[,"Nhood.Group"], sep="_")


```

```{r, echo=FALSE}
# 
# 
# if(is.null(na.function)){
#     warning("NULL passed to na.function, using na.pass")
#     na.func <- get("na.pass")
# } else{
#     tryCatch({
#         na.func <- get(na.function)
#     }, warning=function(warn){
#         warning(warn)
#     }, error=function(err){
#         stop(paste0("NA function ", na.function, " not recognised"))
#     }, finally={
#     })
# }
# 
# n.da <- sum(na.func(da.res$SpatialFDR < da.fdr))
# 
# if((ncol(nhoodAdjacency(x)) == length(nhoods(x))) & isFALSE(compute.new)){
#     message("nhoodAdjacency found - using for nhood grouping")
#     nhs.da.gr <- .group_nhoods_from_adjacency(nhs=nhoods(x),
#                                               nhood.adj=nhoodAdjacency(x),
#                                               da.res=da.res,
#                                               is.da=da.res$SpatialFDR < da.fdr,
#                                               lfc.threshold=lfc.threshold,
#                                               merge.discord=merge.discord,
#                                               overlap=overlap,
#                                               subset.nhoods=subset.nhoods)
# } else{
#     nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x),
#                                           da.res=da.res,
#                                           is.da=da.res$SpatialFDR < da.fdr,
#                                           merge.discord=merge.discord,
#                                           overlap=overlap,
#                                           bits=bits,
#                                           cells=seq_len(ncol(x)),
#                                           subset.nhoods=subset.nhoods) # returns a vector group values for each nhood
#         # nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x),
#         #                                       da.res=da.res,
#         #                                       is.da=da.res$SpatialFDR < da.fdr,
#         #                                       merge.discord=merge.discord,
#         #                                       
#         #                                       overlap=overlap,
#         #                                       bits=bits,
#         #                                       cells=seq_len(ncol(x)),
#         #                                       subset.nhoods=subset.nhoods)
#     }
# 
# nhood.gr <- unique(nhs.da.gr)
# # perform DGE _within_ each group of cells using the input design matrix
# message(paste0("Nhoods aggregated into ", length(nhood.gr), " groups"))
# 
# fake.meta <- data.frame("CellID"=colnames(x), "Nhood.Group"=rep(NA, ncol(x)))
# rownames(fake.meta) <- fake.meta$CellID
# 
# for(i in seq_along(nhood.gr)){
#     nhood.x <- nhs.da.gr == nhood.gr[i]
#     # get the nhoods
#     nhs <- nhoods(x)
#     if(!is.null(subset.nhoods)){
#         nhs <- nhs[subset.nhoods]
#     }
# 
#     if(!any(is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group))){
#         fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group[!is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group)] <- NA
#         } else{
#             fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group <- nhood.gr[i]
#         }
# }
# 
# # only compare against the other DA neighbourhoods
# x <- x[, !is.na(fake.meta$Nhood.Group)]
# fake.meta <- fake.meta[!is.na(fake.meta$Nhood.Group), ]
# 
# if(!is.null(subset.row)){
#     x <- x[subset.row, , drop=FALSE]
# }
# 
# fake.meta[,"sample_id"] <- sapply(strsplit(fake.meta$CellID, "_"), function(x) paste(x[1], x[2], sep="_"))
# fake.meta[,'sample_group'] <- paste(fake.meta[,"sample_id"], fake.meta[,"Nhood.Group"], sep="_")

sample_gr_mat <- matrix(0, nrow=nrow(fake.meta), ncol=length(unique(fake.meta$sample_group)))
colnames(sample_gr_mat) <- unique(fake.meta$sample_group)
rownames(sample_gr_mat) <- rownames(fake.meta)

for (s in colnames(sample_gr_mat)) {
  sample_gr_mat[which(fake.meta$sample_group == s),s] <- 1  
}

exprs <- assay(x, assay)

exprs_smp <- matrix(0, nrow=nrow(exprs), ncol=ncol(sample_gr_mat))
for (i in 1:ncol(sample_gr_mat)){
  if (sum(sample_gr_mat[,i]) > 1) {
    exprs_smp[,i] <- rowSums(exprs[,which(sample_gr_mat[,i] > 0)])  
  } else {
    exprs_smp[,i] <- exprs[,which(sample_gr_mat[,i] > 0)]
  }
}
rownames(exprs_smp) <- rownames(exprs)
colnames(exprs_smp) <- colnames(sample_gr_mat)

smp_meta <- distinct(fake.meta, sample_group, Nhood.Group)
rownames(smp_meta) <- smp_meta[,"sample_group"]

marker.list <- list()
i.contrast <- c("TestTest - TestRef") # always use contrasts for this

  # # if there is only 1 group, then need to make sure that all neighbourhoods
  # # are not in this group - otherwise can't do any DGE testing
  # if(length(nhood.gr) == 1){
  #     if(sum(fake.meta$Nhood.Group == nhood.gr[1]) == nrow(fake.meta)){
  #         warning("All graph neighbourhoods are in the same group - cannot perform DGE testing. Returning NULL")
  #         return(NULL)
  #     }
  # }

nhood.gr <- unique(fake.meta[,"Nhood.Group"])
  for(i in seq_along(nhood.gr)){
      i.meta <- smp_meta
      i.meta$Test <- "Ref"
      i.meta$Test[smp_meta$Nhood.Group == nhood.gr[i]] <- "Test"
      if(ncol(exprs_smp) > 1 & nrow(i.meta) > 1){
          i.design <- as.formula(" ~ 0 + Test")
          i.model <- model.matrix(i.design, data=i.meta)
          rownames(i.model) <- rownames(i.meta)
      }

      sink(file="/dev/null")
      gc()
      sink(file=NULL)

      if(assay == "logcounts"){
          i.res <- .perform_lognormal_dge(exprs_smp, i.model, model.contrasts=i.contrast,
                                          gene.offset=gene.offset)
      } else if(assay == "counts"){
          i.res <- .perform_counts_dge(exprs_smp, i.model, model.contrasts=i.contrast,
                                       gene.offset=gene.offset)
          colnames(i.res)[ncol(i.res)] <- "adj.P.Val"
      } else{
          warning("Assay type is not counts or logcounts - assuming (log)-normal distribution. Use these results at your peril")
          i.res <- .perform_lognormal_dge(exprs_smp, i.model,
                                          model.contrasts=i.contrast,
                                          gene.offset=gene.offset)
      }
      
      
      i.res$adj.P.Val[is.na(i.res$adj.P.Val)] <- 1
      i.res$logFC[is.infinite(i.res$logFC)] <- 0
      
      i.res <- i.res[, c("logFC", "adj.P.Val")]
      colnames(i.res) <- paste(colnames(i.res), nhood.gr[i], sep="_")
      marker.list[[paste0(nhood.gr[i])]] <- i.res
  }

  marker.df <- do.call(cbind.data.frame, marker.list)
  colnames(marker.df) <- gsub(colnames(marker.df), pattern="^[0-9]+\\.", replacement="")
  marker.df$GeneID <- rownames(i.res)
```


<!-- ```{r} -->
<!-- markers_df <- -->
<!--   findNhoodMarkers(x = liver_milo, -->
<!--   da.res = milo_res, -->
<!--   da.fdr=0.1, -->
<!--   assay="counts", -->
<!--   aggregate.samples = TRUE, sample_col = "sample_id", -->
<!--   overlap=1, -->
<!--   lfc.threshold = 0.1, -->
<!--   merge.discord=FALSE, -->
<!--   subset.row = hvgs, -->
<!--   gene.offset=FALSE, -->
<!--   return.groups=FALSE, -->
<!--   subset.nhoods = str_detect(milo_res$annotation_indepth, "Endo"), -->
<!--   na.function="na.pass", -->
<!--   compute.new=FALSE, -->
<!--   bits=FALSE) -->
<!-- ``` -->

#### Significant DGE genes (FDR 10%)

```{r}
marker.df[marker.df$adj.P.Val_1 < 0.05,]
```



<!-- ```{r} -->
<!-- x = liver_milo -->
<!-- da.res = milo_res -->
<!-- da.fdr=0.1 -->
<!-- assay="logcounts" -->
<!-- overlap=1 -->
<!-- lfc.threshold=0.1 -->
<!-- merge.discord=FALSE -->
<!-- subset.row = endo_hvgs -->
<!-- gene.offset=TRUE -->
<!-- return.groups=TRUE -->
<!-- subset.nhoods = str_detect(milo_res$annotation_indepth, "Endo") -->
<!-- na.function="na.pass" -->
<!-- compute.new=FALSE -->


<!-- if(!is(x, "Milo")){ -->
<!--     stop("Unrecognised input type - must be of class Milo") -->
<!-- } else if(any(!assay %in% assayNames(x))){ -->
<!--     stop(paste0("Unrecognised assay slot: ", assay)) -->
<!-- } -->

<!-- if(is.null(na.function)){ -->
<!--     warning("NULL passed to na.function, using na.pass") -->
<!--     na.func <- get("na.pass") -->
<!-- } else{ -->
<!--     tryCatch({ -->
<!--         na.func <- get(na.function) -->
<!--     }, warning=function(warn){ -->
<!--         warning(warn) -->
<!--     }, error=function(err){ -->
<!--         stop(paste0("NA function ", na.function, " not recognised")) -->
<!--     }, finally={ -->
<!--     }) -->
<!-- } -->

<!-- n.da <- sum(na.func(da.res$SpatialFDR < da.fdr)) -->
<!-- if(!is.na(n.da) & n.da == 0){ -->
<!--     stop("No DA neighbourhoods found") -->
<!-- } -->

<!-- if(any(is.na(da.res$SpatialFDR))){ -->
<!--     warning("NA values found in SpatialFDR vector") -->
<!-- } -->

<!-- message(paste0("Found ", n.da, " DA neighbourhoods at FDR ", da.fdr*100, "%")) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- nhs = nhoods(x) -->
<!-- nhood.adj = nhoodAdjacency(x) -->
<!-- da.res = milo_res -->
<!-- is.da = da.res$SpatialFDR < da.fdr -->
<!--     # # assume order of nhs is the same as nhood.adj -->
<!--     # if(!is.null(subset.nhoods)){ -->
<!--     #     if(mode(subset.nhoods) %in% c("character", "logical", "numeric")){ -->
<!--     #         # force use of logicals for consistency -->
<!--     #         if(mode(subset.nhoods) %in% c("character", "numeric")){ -->
<!--     #             sub.log <- names(nhs) %in% subset.nhoods -->
<!--     #         } else{ -->
<!--     #             sub.log <- subset.nhoods -->
<!--     #         } -->
<!--     #  -->
<!--     #         nhood.adj <- nhood.adj[sub.log, sub.log] -->
<!--     #  -->
<!--     #         if(length(is.da) == length(nhs)){ -->
<!--     #             nhs <- nhs[sub.log] -->
<!--     #             is.da <- is.da[sub.log] -->
<!--     #             da.res <- da.res[sub.log, ] -->
<!--     #         } else{ -->
<!--     #             stop("Subsetting `is.da` vector length does not equal nhoods length") -->
<!--     #         } -->
<!--     #     } else{ -->
<!--     #         stop(paste0("Incorrect subsetting vector provided:", class(subset.nhoods))) -->
<!--     #     } -->
<!--     # } else{ -->
<!--     #     if(length(is.da) != ncol(nhood.adj)){ -->
<!--     #         stop("Subsetting `is.da` vector length is not the same dimension as adjacency") -->
<!--     #     } -->
<!--     # } -->
<!--     #  -->
<!--     #  -->
<!--     # ## check for concordant signs - assume order is the same as nhoods -->
<!--     # if(isFALSE(merge.discord)){ -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #      -->
<!--     #     concord.sign <- sapply((1:nrow(ll_names))[nonz.pairs], function(x) sign(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) == -->
<!--     #                                sign(da.res[nonz.nhs,][as.numeric(ll_names[x, 2]), ]$logFC)) -->
<!--     #  -->
<!--     #     ll_names[,"adj"][nonz.pairs][!concord.sign] <- 0 -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # if(overlap > 1){ -->
<!--     #     # loop over adj dimensions and mask out cells with insufficient overlapping cells -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #     ll_names[nonz.pairs,"adj"] <- ifelse(ll_names[nonz.pairs,"adj"] < overlap, 0, ll_names[nonz.pairs,"adj"] ) -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # if(!is.null(lfc.threshold)){ -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #      -->
<!--     #     # set adjacency to 0 for nhoods with lfc < threshold -->
<!--     #     lfc.pass <- sapply((1:nrow(ll_names))[nonz.pairs], function(x) (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) >= lfc.threshold) & -->
<!--     #                            (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 2]), ]$logFC) >= lfc.threshold)) -->
<!--     #      -->
<!--     #     ll_names[,"adj"][nonz.pairs][!lfc.pass] <- 0 -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # # binarise -->
<!--     # nhood.adj <- as.matrix((nhood.adj > 0) + 0) -->
<!--     #  -->
<!--     # n.dim <- ncol(nhood.adj) -->
<!--     # if(!isSymmetric(nhood.adj)){ -->
<!--     #     stop("Overlap matrix is not symmetric") -->
<!--     # } -->
<!--     #  -->
<!--     # if(nrow(nhood.adj) != ncol(nhood.adj)){ -->
<!--     #     stop("Non-square distance matrix - check nhood subsetting") -->
<!--     # } -->
<!--     #  -->
<!--     # g <- graph_from_adjacency_matrix(nhood.adj, mode="undirected", diag=FALSE) -->
<!--     # groups <- components(g)$membership -->
<!--     #  -->
<!--     # # only keep the groups that contain >= 1 DA neighbourhoods -->
<!--     # keep.groups <- intersect(unique(groups[is.da]), unique(groups)) -->
<!-- .group_nhoods_from_adjacency <- function(x, da.res, is.da, -->
<!--                                          merge.discord=FALSE, -->
<!--                                          lfc.threshold=NULL, -->
<!--                                          overlap=1, subset.nhoods=NULL){ -->

<!--     if(is.null(names(nhs))){ -->
<!--         warning("No names attributed to nhoods. Converting indices to names") -->
<!--         names(nhs) <- as.character(c(1:length(nhs))) -->
<!--     } -->

<!--   nh_graph <- nhoodGraph(x) -->
<!--   ## Change names from nhoodIndex to nhoodNumber -->
<!--   V(nh_graph)$name <- match(V(nh_graph)$name, unlist(nhoodIndex(x))) -->

<!--   max(as.numeric(V(nh_graph)$name)) -->

<!--   if(!is.null(subset.nhoods)){ -->
<!--     da.res <- da.res[subset.nhoods,]   -->
<!--   } -->

<!--   keep.nhoods <- da.res$SpatialFDR < da.fdr &  -->
<!--     abs(da.res$logFC) > lfc.threshold -->

<!-- # unlist(nhoodIndex(x)[subset.nhoods]) -->
<!-- # as.numeric(V(nh_graph)$name) %in% unlist(nhoodIndex(x)[subset.nhoods]) & -->
<!-- #   (da.res$SpatialFDR < da.fdr) -->

<!--   nh_graph <-  -->
<!--     igraph::induced_subgraph(nh_graph,  -->
<!--                              vids = as.character(da.res$Nhood[keep.nhoods]) -->
<!--                              # vids = which(as.numeric(V(nh_graph)$name) %in% unlist(nhoodIndex(x)[da.res[keep.nhoods,"Nhood"]])) -->
<!--                              ) -->
<!--   groups <- components(nh_graph)$membership -->
<!--   keep.groups <- intersect(unique(groups[is.da]), unique(groups)) -->



<!--   return(groups[groups %in% keep.groups]) -->
<!-- } -->


<!-- da.res %>% -->
<!--   mutate(keep=keep.nhoods) %>% -->
<!--   ggplot(aes(logFC, -log10(SpatialFDR), color=keep)) + -->
<!--   geom_point() -->
<!-- ggraph(nh_graph) + geom_edge_link2() -->
<!-- ``` -->


<!-- ```{r} -->
<!-- lfc.threshold = 0 -->
<!-- if(!is.null(nhoodAdjacency(x)) & isFALSE(compute.new)){ -->
<!--     message("nhoodAdjacency found - using for nhood grouping") -->
<!--     nhs.da.gr <- .group_nhoods_from_adjacency(x, -->
<!--                                               da.res=milo_res, -->
<!--                                               is.da=da.res$SpatialFDR < da.fdr, -->
<!--                                               subset.nhoods = subset.nhoods, -->
<!--                                               merge.discord=merge.discord, -->
<!--                                               overlap=overlap, -->
<!--                                               lfc.threshold=lfc.threshold -->
<!--                                               ) -->
<!-- } else{ -->
<!--     nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x), -->
<!--                                           da.res=da.res, -->
<!--                                           is.da=da.res$SpatialFDR < da.fdr, -->
<!--                                           merge.discord=merge.discord, -->
<!--                                           overlap=overlap, -->
<!--                                           ) # returns a vector group values for each nhood -->
<!-- } -->

<!-- nhood.gr <- unique(nhs.da.gr) -->

<!-- names(nhs.da.gr) -->
<!-- # names(nhs.da.gr) <- match(names(nhs.da.gr), nhoodIndex(x)) -->
<!-- nhs.da.gr.all <- rep(NA, length(nhoods(x))) -->

<!-- nhs.da.gr.all[as.numeric(names(nhs.da.gr))] <- nhs.da.gr -->


<!-- # perform DGE _within_ each group of cells using the input design matrix -->
<!-- message(paste0("Nhoods aggregated into ", length(nhood.gr), " groups")) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- cell2nhoods <- function(x){ -->
<!--   nhs <- lapply(nhoods(x), function(nh) as.vector(nh)) -->
<!--   # nhood_mat <- matrix(nrow = ncol(x), ncol=length(nhoods(x))) -->
<!--   nhood_mat <- sapply(nhs, function(nh) ifelse(1:ncol(x) %in% nh, 1, 0)) -->
<!--   return(nhood_mat) -->
<!-- } -->
<!-- ``` -->


<!-- ```{r} -->
<!-- x <- liver_milo -->
<!-- nh_mat <- cell2nhoods(x) -->
<!-- fake.meta <- data.frame("CellID"=colnames(x)) -->
<!-- # rownames(fake.meta) <- fake.meta$CellID -->
<!-- nhs.da.gr.all <- ifelse(gr1, 1, ifelse(gr2, 2, NA)) -->

<!-- dim(nh_mat) -->
<!-- nhood.gr <- unique(nhood.da.gr.all[!is.na(nhood.da.gr.all)]) -->
<!-- for(i in seq_along(nhood.gr)){ -->
<!--   gr.cells <- rowSums(nh_mat[which(nhs.da.gr.all == nhood.gr[i]),]) > 0 -->
<!--   fake.meta[,paste0("group_", i)] <- gr.cells -->
<!-- } -->
<!-- # dim(fake.meta[rowSums(nh_mat[,which(nhs.da.gr.all == nhood.gr[i])]) > 0,]) -->
<!-- # # nhs.da.gr.all <- ifelse(1:length(nhoods(x)) %in% match(names(nhs.da.gr), nhoodIndex(x)), nhs.da.gr, NA) -->
<!-- # for(i in seq_along(nhood.gr)){ -->
<!-- #     nhood.x <- nhs.da.gr.all == nhood.gr[i] -->
<!-- #     nhood.x[is.na(nhood.x)] <- FALSE -->
<!-- #     fake.meta[unlist(nhoods(x)[da.res$SpatialFDR < da.fdr][nhood.x]), paste0("group_", i)] <- nhood.gr[i] -->
<!-- # } -->


<!-- fake.meta <- -->
<!--   fake.meta %>% -->
<!--   mutate(Nhood.Group = ifelse(group_1, 1, ifelse(group_2, 2, NA))) -->
<!-- rownames(fake.meta) <- fake.meta$CellID -->

<!-- colData(endo_milo)[["group_1"]] <- as.factor(fake.meta[colnames(endo_milo),"group_1"]) -->
<!-- colData(endo_milo)[["group_2"]] <- as.factor(fake.meta[colnames(endo_milo),"group_2"]) -->
<!-- colData(endo_milo)[["NhoodGroup"]] <- as.factor(fake.meta[colnames(endo_milo),"NhoodGroup"]) -->
<!-- plotUMAP(endo_milo, colour_by="NhoodGroup") -->

<!-- ``` -->



<!-- ```{r} -->

<!-- # only compare against the other DA neighbourhoods -->
<!-- x <- x[, !is.na(fake.meta$Nhood.Group)] -->
<!-- fake.meta <- fake.meta[!is.na(fake.meta$Nhood.Group), ] -->

<!-- if(!is.null(subset.row)){ -->
<!--     x <- x[subset.row, , drop=FALSE] -->
<!-- } -->

<!--     exprs <- assay(x, assay) -->

<!--     marker.list <- list() -->
<!--     i.contrast <- c("TestTest - TestRef") # always use contrasts for this -->

<!--     # if there is only 1 group, then need to make sure that all neighbourhoods -->
<!--     # are not in this group - otherwise can't do any DGE testing -->
<!--     if(length(nhood.gr) == 1){ -->
<!--         if(sum(fake.meta$Nhood.Group == nhood.gr[1]) == nrow(fake.meta)){ -->
<!--             warning("All graph neighbourhoods are in the same group - cannot perform DGE testing. Returning NULL") -->
<!--             return(NULL) -->
<!--         } -->
<!--     } -->

<!--     for(i in seq_along(nhood.gr)){ -->
<!--         i.meta <- fake.meta -->
<!--         i.meta$Test <- "Ref" -->
<!--         i.meta$Test[fake.meta$Nhood.Group == nhood.gr[i]] <- "Test" -->

<!--         if(ncol(exprs) > 1 & nrow(i.meta) > 1){ -->
<!--             i.design <- as.formula(" ~ 0 + Test") -->
<!--             i.model <- model.matrix(i.design, data=i.meta) -->
<!--             rownames(i.model) <- rownames(i.meta) -->
<!--         } -->

<!--         sink(file="/dev/null") -->
<!--         gc() -->
<!--         sink(file=NULL) -->

<!--         if(assay == "logcounts"){ -->
<!--             i.res <- .perform_lognormal_dge(exprs, i.model, model.contrasts=i.contrast, -->
<!--                                             gene.offset=gene.offset) -->
<!--         } else if(assay == "counts"){ -->
<!--             i.res <- .perform_counts_dge(exprs, i.model, model.contrasts=i.contrast, -->
<!--                                          gene.offset=gene.offset) -->
<!--         } else{ -->
<!--             warning("Assay type is not counts or logcounts - assuming (log)-normal distribution. Use these results at your peril") -->
<!--             i.res <- .perform_lognormal_dge(exprs, i.model, -->
<!--                                             model.contrasts=i.contrast, -->
<!--                                             gene.offset=gene.offset) -->
<!--         } -->

<!--         i.res$adj.P.Val[is.na(i.res$adj.P.Val)] <- 1 -->
<!--         i.res$logFC[is.infinite(i.res$logFC)] <- 0 -->

<!--         i.res <- i.res[, c("logFC", "adj.P.Val")] -->
<!--         colnames(i.res) <- paste(colnames(i.res), nhood.gr[i], sep="_") -->
<!--         marker.list[[paste0(nhood.gr[i])]] <- i.res -->
<!--     } -->

<!--     marker.df <- do.call(cbind.data.frame, marker.list) -->
<!--     colnames(marker.df) <- gsub(colnames(marker.df), pattern="^[0-9]+\\.", replacement="") -->
<!--     marker.df$GeneID <- rownames(i.res) -->

<!--     if(isTRUE(return.groups)){ -->
<!--         out.list <- list("groups"=fake.meta, "dge"=marker.df) -->
<!--         return(out.list) -->
<!--     }else{ -->
<!--         return(marker.df) -->
<!--     } -->
<!-- nhood_markers <- out.list -->
<!-- sum(marker.df$adj.P.Val_1 < 0.001) -->

<!-- bcell_genes <- c('MS4A1', "CD19", "FCGR2B", "IGHA1") -->

<!-- plotUMAP(endo_milo, colour_by=bcell_genes[3]) -->

<!-- ``` -->

#### Visualize as volcano 

```{r, fig.height=6, fig.width=10, message=FALSE, warning=FALSE}
marker.df %>%
   mutate(adj.P.Val_1 = ifelse(- log10(adj.P.Val_1) > 300, 1e-300, adj.P.Val_1)) %>%
  mutate(label=ifelse((adj.P.Val_1) < 0.05, GeneID, NA)) %>%
  ggplot(aes(logFC_2, -log10(adj.P.Val_1), 
             # color=highlight
             )) + 
  geom_point() +
  ggrepel::geom_text_repel(aes(label=label)) +
  xlab("logFC") + ylab("- log10(Adj. P value)") +
  theme_bw(base_size = 22)
  
  
```

#### Visualize as heatmap 
(gene expression values are scaled between 0 and 1 for each gene)

```{r, fig.height=14, fig.width=9, message=FALSE, warning=FALSE}
marker_genes <- marker.df %>%
  dplyr::filter(adj.P.Val_1 < 0.05) %>%
  pull(GeneID)

highlight_genes <- c("PLVAP", "SOX14", "VWA1", "ACKR1", "IL32",
                     "CLEC4G", "CLEC4M", "STAB2", "MRC1", "LYVE1",
                     "CD14", "SOX17", "WNT2", "RSPO3", "AIF1L",
                     "PROX1", "PDPN","CPE", "CD320", "BMPER")

fig4_bbright <- plotNhoodExpressionDA(liver_milo, milo_res, marker_genes, cluster_features = TRUE, 
                      alpha = 0.1,
                      scale_to_1 = TRUE,
                      subset.nhoods = milo_res$annotation_lineage=="Endothelia",
                      highlight_features = highlight_genes, show_rownames = TRUE
                      ) +
  ylab("DE genes") +
   theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) +
  plot_layout(heights = c(1,10))

fig4_bbright
```

<!-- Customize plotting for paper figure -->

<!-- ```{r} -->
<!-- x <- liver_milo -->
<!-- da.res <- milo_res -->

<!-- cluster_features = TRUE -->
<!-- alpha = 0.1 -->
<!-- scale_to_1 = TRUE -->
<!-- subset.nhoods = milo_res$annotation_lineage=="Endothelia" -->
<!-- features <- marker_genes -->

<!-- expr_mat <- nhoodExpression(x)[features,] -->
<!-- colnames(expr_mat) <- 1:length(nhoods(x)) -->

<!-- ## Get nhood expression matrix -->
<!-- if (!is.null(subset.nhoods)) { -->
<!--   expr_mat <- expr_mat[,subset.nhoods, drop=FALSE] -->
<!-- } -->

<!-- if (!isFALSE(scale_to_1)) { -->
<!--   expr_mat <- t(apply(expr_mat, 1, function(x) (x - min(x))/(max(x)- min(x)))) -->
<!-- } -->

<!-- rownames(expr_mat) <- sub(pattern = "-", replacement = ".", rownames(expr_mat)) ## To avoid problems when converting to data.frame -->

<!-- pl_df <- data.frame(t(expr_mat)) %>% -->
<!--   rownames_to_column("Nhood") %>% -->
<!--   mutate(Nhood=as.double(Nhood)) %>% -->
<!--   left_join(da.res, by="Nhood") %>% -->
<!--   mutate(logFC_rank=percent_rank(logFC)) -->

<!-- ## Top plot: nhoods ranked by DA log FC -->
<!-- pl_top <- pl_df %>% -->
<!--   mutate(is_signif = ifelse(SpatialFDR < alpha, paste0("SpatialFDR < ", alpha), NA)) %>% -->
<!--   ggplot(aes(logFC_rank, logFC)) + -->
<!--   geom_hline(yintercept = 0, linetype=2) + -->
<!--   geom_point(size=0.2, color="grey") + -->
<!--   geom_point(data=. %>% dplyr::filter(!is.na(is_signif)), aes(color=is_signif), size=1) + -->
<!--   theme_bw(base_size=22) + -->
<!--   ylab("DA logFC") + -->
<!--   scale_color_manual(values="red", name="") + -->
<!--   guides(color=guide_legend(override.aes = list(size=2))) + -->
<!--   scale_x_continuous(expand = c(0.01, 0)) + -->
<!--   theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) -->

<!-- ## Bottom plot: gene expression heatmap -->
<!-- if (isTRUE(cluster_features)) { -->
<!--   row.order <- hclust(dist(expr_mat))$order # clustering -->
<!--   ordered_features <- rownames(expr_mat)[row.order] -->
<!-- } else { -->
<!--   ordered_features <- rownames(expr_mat) -->
<!-- } -->


<!-- ``` -->


<!-- ```{r, fig.height=12, fig.width=10} -->
<!-- # detach('package:EnsDb.Hsapiens.v86') -->
<!-- # detach('package:ensembldb') -->

<!-- highlight_genes <- c("PLVAP", "SOX14", "VWA1", "ACKR1", -->
<!--                      "CLEC4G", "CLEC4M", "STAB2", "MRC1", -->
<!--                      "CD14", "CCL21", "SOX17", "WNT2", "RSPO3", "AIF1L", -->
<!--                      "PROX1", "PDPN","CPE", "CD320") -->

<!-- pl_bottom <- pl_df %>% -->
<!--   pivot_longer(cols=rownames(expr_mat), names_to='feature', values_to="avg_expr") %>% -->
<!--   mutate(feature=factor(feature, levels=ordered_features)) %>% -->
<!--   mutate(label=ifelse(feature %in% highlight_genes, as.character(feature), NA)) %>% -->
<!--   ggplot(aes(logFC_rank, feature, fill=avg_expr)) + -->
<!--   geom_tile() + -->
<!--   scale_fill_viridis_c(option="magma", name="Scaled \nexpression") + -->
<!--   xlab("Endothelial Neighbourhoods") + ylab("DE genes") + -->
<!--   scale_x_continuous(expand = c(0.01, 0), -->
<!--                      # limits = c(0, max(pl_df$logFC_rank) + 0.2) -->
<!--                      ) + -->
<!--   scale_y_discrete(expand = c(0.04,0)) + -->
<!--   coord_cartesian(clip="off") + -->
<!--   ggrepel::geom_text_repel(data=. %>% dplyr::filter(!is.na(label)) %>% -->
<!--               group_by(label) %>% -->
<!--               summarise(logFC_rank=max(logFC_rank), avg_expr=mean(avg_expr), feature=dplyr::first(feature)), -->
<!--             aes(label=label, x=logFC_rank), size=6,  -->
<!--             xlim = c(max(pl_df$logFC_rank) + 0.05, max(pl_df$logFC_rank) + 0.15), -->
<!--             # ylim = c(min(pl_df$)) -->
<!--             hjust=1, -->
<!--             segment.curvature = -0.1, -->
<!--     segment.ncp = 3, -->
<!--     segment.angle = 20, -->
<!--             min.segment.length = 0, seed=42 -->
<!--             ) + -->
<!--   theme_classic(base_size = 22) + -->
<!--   theme(axis.text.x = element_blank(), axis.line.x = element_blank(), axis.ticks.x = element_blank(), -->
<!--         axis.text.y = element_blank(), axis.line.y = element_blank(), axis.ticks.y = element_blank(), -->
<!--         panel.spacing = margin(2, 2, 2, 2, "cm"), -->
<!--         legend.margin=margin(0,0,0,0), -->
<!--         legend.box.margin=margin(10,10,10,10) -->
<!--         ) -->

<!-- ## Assemble plot -->
<!-- fig4_bbright <-  (pl_top / pl_bottom) + plot_layout(heights = c(1,4), guides = "collect") &  -->
<!--     theme(legend.justification=c(0, 1)) -->
<!-- fig4_bbright +  ggsave("~/milo_output/liver_DEG_heatmap.pdf", width=9, height = 9) -->

<!-- ``` -->

### GO term analysis

```{r, echo=FALSE, warning=FALSE, message=FALSE}
# BiocManager::install('clusterProfiler')
# BiocManager::install('msigdbr')
library(clusterProfiler)
library(msigdbr)

m_df <- msigdbr(species = "Homo sapiens")
m_t2g <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP")  %>% 
  dplyr::select(gs_name, gene_symbol)

marker_genes_up <- marker.df %>%
  dplyr::filter(adj.P.Val_1 < 0.05 & logFC_1 > 0) %>%
  pull(GeneID)

marker_genes_down <- marker.df %>%
  dplyr::filter(adj.P.Val_1 < 0.05 & logFC_1 < 0) %>%
  pull(GeneID)

em_up <- enricher(marker_genes_up, TERM2GENE=m_t2g, pAdjustMethod = "fdr", 
                  universe = rownames(liver_milo)
                  )
em_down <- enricher(marker_genes_down, TERM2GENE=m_t2g, pAdjustMethod = "fdr", 
                    universe = rownames(liver_milo)
                    )

em_res_up <- em_up@result[em_up@result$qvalue < 0.1,] %>%
  dplyr::select(- c(Description))
em_res_down <- em_down@result[em_down@result$qvalue < 0.1,] %>%
  dplyr::select(- c(Description))
```

```{r, fig.height=8, fig.width=15, warning=FALSE, message=FALSE}
em_res_up %>%
  top_n(20, -log10(qvalue)) %>%
  mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
  ggplot(aes(Term, -log10(qvalue))) +
  geom_point() +
  coord_flip() +
  xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
  theme_bw(base_size=14) +
  ggtitle("Markers of cirrhotic")

em_res_down %>%
  top_n(30, -log10(qvalue)) %>%
  mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
  ggplot(aes(Term, -log10(qvalue))) +
  geom_point() +
  coord_flip() +
  xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
  theme_bw(base_size=14) +
  ggtitle("Markers of healthy")
```

### Full GO enrichment - up in cirrhotic
(`geneID` column indicates which genes belong to gene ontology)
```{r}
em_res_up
```

### Full GO enrichment - up in healty
(`geneID` column indicates which genes belong to gene ontology)
```{r}
em_res_down
```


## Close-up on Cholangiocytes

<!-- ```{r} -->
<!-- chol_milo <- scater::runUMAP(liver_milo[,liver_milo$annotation_lineage=="Cholangiocytes"],  dimred='PCA') -->
<!-- plotUMAP(chol_milo, colour_by = "annotation_indepth") -->
<!-- ``` -->

Filter out cells that show contamination from immune cells (expression of immune markers)

```{r}
keep <- logcounts(chol_milo)["CD19",] == 0 | logcounts(chol_milo)["MS4A1",] == 0
chol_milo <- chol_milo[,keep]
chol_milo <- scater::runUMAP(chol_milo,  dimred='PCA')

plotUMAP(chol_milo, colour_by = "annotation_indepth")
```

```{r, fig.width=10, fig.height=8}
umap_df <- data.frame(reducedDim(chol_milo, "UMAP"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")

chol_umap <- cbind(umap_df, condition=chol_milo$condition) %>%
   ggplot(aes(UMAP_1, UMAP_2, color=condition)) +
  geom_point(size=0.3, alpha=0.5) +
  scale_color_brewer(palette="Set1", name='') +
  xlab("UMAP1") + ylab("UMAP2") +
  coord_fixed() +
  guides(color="none") +
  facet_wrap(condition~., ncol=1) +
  theme_nothing() +
  theme(axis.text = element_blank(), axis.ticks = element_blank(), legend.position=c(0.9,0.9),
        strip.background = element_rect(color=NA), strip.text = element_text(size=22))

chol_umap
```

```{r, fig.width=8, fig.height=4, warning=FALSE, message=FALSE}
liver_milo2 <- liver_milo
subset.nhoods <- str_detect(milo_res$annotation_indepth, "Cholangio")
reducedDim(liver_milo2, "UMAP")[colnames(chol_milo),] <- reducedDim(chol_milo, "UMAP") 

chol_gr <-
  plotNhoodGraphDA(
  liver_milo2, milo_res,
  subset.nhoods = subset.nhoods, 
  # ) =)[1:(length()-1)], 
  alpha = 0.1
  )  +
   theme(legend.text = element_text(size=22), legend.title = element_text(size=24))
  
(chol_umap + chol_gr ) + 
  plot_layout(widths = c(1,2), 
                guides = "collect"
                )
# fig4_bright1 +
#   ggsave("~/milo_output/liver_endoGraph.pdf", width=9, height = 5)  

```

### Differential expression between DA neighbourhoods

We merge overlapping nhoods with significant DA and the same direction of logFC, and then test for differential expression between cells in disease-specific and healthy-specific nhoods. This allows us to perform a comparison without further clustering.

<!-- ds -->
<!-- ```{r} -->
<!-- # dec_liver <- modelGeneVar(liver_milo) -->
<!-- # fit_liver <- metadata(dec_liver) -->
<!-- # hvgs <- getTopHVGs(dec_liver, n=3000) -->
<!-- #  -->
<!-- # ## keep HV genes expressed in at least 5% of endothelial cells -->
<!-- # endo_hvg_cnts <- counts(liver_milo)[hvgs,liver_milo$annotation_lineage=="Endothelia"] -->
<!-- # endo_hvgs <- hvgs[(rowSums(endo_hvg_cnts!=0) > (ncol(endo_hvg_cnts)/100)*5) & -->
<!-- #                    (rowSums(endo_hvg_cnts!=0) < (ncol(endo_hvg_cnts)/100)*70)] -->
<!-- # # & -->
<!-- # #                     rowMeans(endo_hvg_cnts) < 5]  ## Remove very highly expressed -->
<!-- #  -->
<!-- # endo_hvgs <- scan("~/endo_hvgs.txt", what = "") -->
<!-- ``` -->

<!-- ```{r} -->
<!-- # liver_milo <- calcNhoodExpression(liver_milo, subset.row = hvgs) -->

<!-- nhood_markers <- findNhoodMarkers(liver_milo, milo_res, overlap=1, return.groups = TRUE, -->
<!--                                   subset.row = hvgs, -->
<!--                                   lfc.threshold = 0.1, -->
<!--                                   subset.nhoods = str_detect(milo_res$annotation_indepth, "Endo"), -->
<!--                                   compute.new = FALSE) -->


<!-- sum(nhood_markers$dge$adj.P.Val_1 < 0.05) -->
<!-- nhood_markers$dge[nhood_markers$dge$GeneID %in% highlight_genes,] -->
<!-- ``` -->

Here we find markers grouping gene expression counts by samples (i.e. we don't treat cells as replicates, but exploit the replication structure used also for DA testing)

```{r, echo=FALSE}
x = liver_milo
da.res = milo_res
da.fdr=0.1
assay="counts"
overlap=1
lfc.threshold = 0.1
merge.discord=FALSE
subset.row = hvgs
gene.offset=FALSE
return.groups=FALSE
subset.nhoods = str_detect(milo_res$annotation_indepth, "Chol")
na.function="na.pass"
compute.new=FALSE
bits=FALSE
aggregate.samples=TRUE


if(is.null(na.function)){
    warning("NULL passed to na.function, using na.pass")
    na.func <- get("na.pass")
} else{
    tryCatch({
        na.func <- get(na.function)
    }, warning=function(warn){
        warning(warn)
    }, error=function(err){
        stop(paste0("NA function ", na.function, " not recognised"))
    }, finally={
    })
}

n.da <- sum(na.func(da.res$SpatialFDR < da.fdr))

if((ncol(nhoodAdjacency(x)) == length(nhoods(x))) & isFALSE(compute.new)){
    message("nhoodAdjacency found - using for nhood grouping")
    nhs.da.gr <- .group_nhoods_from_adjacency(nhoods(x),
                                              nhood.adj=nhoodAdjacency(x),
                                              da.res=da.res,
                                              is.da=da.res$SpatialFDR < da.fdr,
                                              merge.discord=merge.discord,
                                              overlap=overlap,
                                              subset.nhoods=subset.nhoods)
} else{
    nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x),
                                          da.res=da.res,
                                          is.da=da.res$SpatialFDR < da.fdr,
                                          merge.discord=merge.discord,
                                          overlap=overlap,
                                          bits=bits,
                                          cells=seq_len(ncol(x)),
                                          subset.nhoods=subset.nhoods) # returns a vector group values for each nhood
}

nhood.gr <- unique(nhs.da.gr)
# perform DGE _within_ each group of cells using the input design matrix
message(paste0("Nhoods aggregated into ", length(nhood.gr), " groups"))

fake.meta <- data.frame("CellID"=colnames(x), "Nhood.Group"=rep(NA, ncol(x)), "sample_id" = colData(x)[[samples]])
rownames(fake.meta) <- fake.meta$CellID

for(i in seq_along(nhood.gr)){
    nhood.x <- nhs.da.gr == nhood.gr[i]
    # get the nhoods
    nhs <- nhoods(x)
    if(!is.null(subset.nhoods)){
        nhs <- nhs[subset.nhoods]
    }

    if(!any(is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group))){
        fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group[!is.na(fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group)] <- NA
        } else{
            fake.meta[unlist(nhs[nhood.x]),]$Nhood.Group <- nhood.gr[i]
        }
}

# only compare against the other DA neighbourhoods
x <- x[, !is.na(fake.meta$Nhood.Group)]
fake.meta <- fake.meta[!is.na(fake.meta$Nhood.Group), ]

if(!is.null(subset.row)){
    x <- x[subset.row, , drop=FALSE]
}

exprs <- assay(x, assay)

# if there is only 1 group, then need to make sure that all neighbourhoods
# are not in this group - otherwise can't do any DGE testing
if(length(nhood.gr) == 1){
    if(sum(fake.meta$Nhood.Group == nhood.gr[1]) == nrow(fake.meta)){
        warning("All graph neighbourhoods are in the same group - cannot perform DGE testing. Returning NULL")
        return(NULL)
    }
}

if (isTRUE(aggregate.samples)) {
  fake.meta[,'sample_group'] <- paste(fake.meta[,"sample_id"], fake.meta[,"Nhood.Group"], sep="_")
  
  sample_gr_mat <- matrix(0, nrow=nrow(fake.meta), ncol=length(unique(fake.meta$sample_group)))
  colnames(sample_gr_mat) <- unique(fake.meta$sample_group)
  rownames(sample_gr_mat) <- rownames(fake.meta)
  
  for (s in colnames(sample_gr_mat)) {
    sample_gr_mat[which(fake.meta$sample_group == s),s] <- 1  
  }
  
  ## Summarise expression by sample
  exprs_smp <- matrix(0, nrow=nrow(exprs), ncol=ncol(sample_gr_mat))
  if (assay=='counts') {
    summFunc <- rowSums
  } else {
    summFunc <- rowMeans
  }
  
  for (i in 1:ncol(sample_gr_mat)){
    if (sum(sample_gr_mat[,i]) > 1) {
      exprs_smp[,i] <- summFunc(exprs[,which(sample_gr_mat[,i] > 0)])  
    } else {
      exprs_smp[,i] <- exprs[,which(sample_gr_mat[,i] > 0)]
    }
  }
  rownames(exprs_smp) <- rownames(exprs)
  colnames(exprs_smp) <- colnames(sample_gr_mat)
  
  smp_meta <- distinct(fake.meta, sample_group, Nhood.Group)
  rownames(smp_meta) <- smp_meta[,"sample_group"]
  
  fake.meta <- smp_meta
  exprs <- exprs_smp
}

marker.list <- list()
i.contrast <- c("TestTest - TestRef") # always use contrasts for this


  for(i in seq_along(nhood.gr)){
      i.meta <- smp_meta
      i.meta$Test <- "Ref"
      i.meta$Test[smp_meta$Nhood.Group == nhood.gr[i]] <- "Test"
      if(ncol(exprs_smp) > 1 & nrow(i.meta) > 1){
          i.design <- as.formula(" ~ 0 + Test")
          i.model <- model.matrix(i.design, data=i.meta)
          rownames(i.model) <- rownames(i.meta)
      }

      sink(file="/dev/null")
      gc()
      sink(file=NULL)

      if(assay == "logcounts"){
          i.res <- .perform_lognormal_dge(exprs_smp, i.model, model.contrasts=i.contrast,
                                          gene.offset=gene.offset)
      } else if(assay == "counts"){
          i.res <- .perform_counts_dge(exprs_smp, i.model, model.contrasts=i.contrast,
                                       gene.offset=gene.offset)
          colnames(i.res)[ncol(i.res)] <- "adj.P.Val"
      } else{
          warning("Assay type is not counts or logcounts - assuming (log)-normal distribution. Use these results at your peril")
          i.res <- .perform_lognormal_dge(exprs_smp, i.model,
                                          model.contrasts=i.contrast,
                                          gene.offset=gene.offset)
      }
      
      
      i.res$adj.P.Val[is.na(i.res$adj.P.Val)] <- 1
      i.res$logFC[is.infinite(i.res$logFC)] <- 0
      
      i.res <- i.res[, c("logFC", "adj.P.Val")]
      colnames(i.res) <- paste(colnames(i.res), nhood.gr[i], sep="_")
      marker.list[[paste0(nhood.gr[i])]] <- i.res
  }

  marker.df <- do.call(cbind.data.frame, marker.list)
  colnames(marker.df) <- gsub(colnames(marker.df), pattern="^[0-9]+\\.", replacement="")
  marker.df$GeneID <- rownames(i.res)
```

#### Table of significant DGE genes (FDR 10%)

```{r}
marker.df[marker.df$adj.P.Val_1 < 0.1,]
```



<!-- ```{r, fig.width=10} -->

<!-- colData(endo_milo)[["NhoodGroup"]] <- as.factor(nhood_markers$groups[colnames(endo_milo),"Nhood.Group"]) -->
<!-- plotUMAP(endo_milo, colour_by="NhoodGroup") + facet_wrap('colour_by') -->
<!-- # ggplot(umap_df, aes(UMAP_1, UMAP_2)) + geom_point(aes(color=as.factor(NhoodGroup))) -->
<!-- ``` -->


<!-- ```{r} -->
<!-- x = liver_milo -->
<!-- da.res = milo_res -->
<!-- da.fdr=0.1 -->
<!-- assay="logcounts" -->
<!-- overlap=1 -->
<!-- lfc.threshold=0.1 -->
<!-- merge.discord=FALSE -->
<!-- subset.row = endo_hvgs -->
<!-- gene.offset=TRUE -->
<!-- return.groups=TRUE -->
<!-- subset.nhoods = str_detect(milo_res$annotation_indepth, "Endo") -->
<!-- na.function="na.pass" -->
<!-- compute.new=FALSE -->


<!-- if(!is(x, "Milo")){ -->
<!--     stop("Unrecognised input type - must be of class Milo") -->
<!-- } else if(any(!assay %in% assayNames(x))){ -->
<!--     stop(paste0("Unrecognised assay slot: ", assay)) -->
<!-- } -->

<!-- if(is.null(na.function)){ -->
<!--     warning("NULL passed to na.function, using na.pass") -->
<!--     na.func <- get("na.pass") -->
<!-- } else{ -->
<!--     tryCatch({ -->
<!--         na.func <- get(na.function) -->
<!--     }, warning=function(warn){ -->
<!--         warning(warn) -->
<!--     }, error=function(err){ -->
<!--         stop(paste0("NA function ", na.function, " not recognised")) -->
<!--     }, finally={ -->
<!--     }) -->
<!-- } -->

<!-- n.da <- sum(na.func(da.res$SpatialFDR < da.fdr)) -->
<!-- if(!is.na(n.da) & n.da == 0){ -->
<!--     stop("No DA neighbourhoods found") -->
<!-- } -->

<!-- if(any(is.na(da.res$SpatialFDR))){ -->
<!--     warning("NA values found in SpatialFDR vector") -->
<!-- } -->

<!-- message(paste0("Found ", n.da, " DA neighbourhoods at FDR ", da.fdr*100, "%")) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- nhs = nhoods(x) -->
<!-- nhood.adj = nhoodAdjacency(x) -->
<!-- da.res = milo_res -->
<!-- is.da = da.res$SpatialFDR < da.fdr -->
<!--     # # assume order of nhs is the same as nhood.adj -->
<!--     # if(!is.null(subset.nhoods)){ -->
<!--     #     if(mode(subset.nhoods) %in% c("character", "logical", "numeric")){ -->
<!--     #         # force use of logicals for consistency -->
<!--     #         if(mode(subset.nhoods) %in% c("character", "numeric")){ -->
<!--     #             sub.log <- names(nhs) %in% subset.nhoods -->
<!--     #         } else{ -->
<!--     #             sub.log <- subset.nhoods -->
<!--     #         } -->
<!--     #  -->
<!--     #         nhood.adj <- nhood.adj[sub.log, sub.log] -->
<!--     #  -->
<!--     #         if(length(is.da) == length(nhs)){ -->
<!--     #             nhs <- nhs[sub.log] -->
<!--     #             is.da <- is.da[sub.log] -->
<!--     #             da.res <- da.res[sub.log, ] -->
<!--     #         } else{ -->
<!--     #             stop("Subsetting `is.da` vector length does not equal nhoods length") -->
<!--     #         } -->
<!--     #     } else{ -->
<!--     #         stop(paste0("Incorrect subsetting vector provided:", class(subset.nhoods))) -->
<!--     #     } -->
<!--     # } else{ -->
<!--     #     if(length(is.da) != ncol(nhood.adj)){ -->
<!--     #         stop("Subsetting `is.da` vector length is not the same dimension as adjacency") -->
<!--     #     } -->
<!--     # } -->
<!--     #  -->
<!--     #  -->
<!--     # ## check for concordant signs - assume order is the same as nhoods -->
<!--     # if(isFALSE(merge.discord)){ -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #      -->
<!--     #     concord.sign <- sapply((1:nrow(ll_names))[nonz.pairs], function(x) sign(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) == -->
<!--     #                                sign(da.res[nonz.nhs,][as.numeric(ll_names[x, 2]), ]$logFC)) -->
<!--     #  -->
<!--     #     ll_names[,"adj"][nonz.pairs][!concord.sign] <- 0 -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # if(overlap > 1){ -->
<!--     #     # loop over adj dimensions and mask out cells with insufficient overlapping cells -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #     ll_names[nonz.pairs,"adj"] <- ifelse(ll_names[nonz.pairs,"adj"] < overlap, 0, ll_names[nonz.pairs,"adj"] ) -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # if(!is.null(lfc.threshold)){ -->
<!--     #     nonz.nhs <- colSums(nhood.adj) > 0 -->
<!--     #     ll_names <- expand.grid(c(1:length(nhs[nonz.nhs])), c(1:length(nhs[nonz.nhs]))) -->
<!--     #     ll_names["adj"] <- sapply(1:nrow(ll_names), function(x) nhood.adj[ll_names[x,1],ll_names[x,2]]) -->
<!--     #     nonz.pairs <- ll_names[,"adj"] > 0 -->
<!--     #      -->
<!--     #     # set adjacency to 0 for nhoods with lfc < threshold -->
<!--     #     lfc.pass <- sapply((1:nrow(ll_names))[nonz.pairs], function(x) (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 1]), ]$logFC) >= lfc.threshold) & -->
<!--     #                            (abs(da.res[nonz.nhs, ][as.numeric(ll_names[x, 2]), ]$logFC) >= lfc.threshold)) -->
<!--     #      -->
<!--     #     ll_names[,"adj"][nonz.pairs][!lfc.pass] <- 0 -->
<!--     #     nhood.adj[nonz.nhs, nonz.nhs] <- ll_names[, "adj"] -->
<!--     # } -->
<!--     #  -->
<!--     # # binarise -->
<!--     # nhood.adj <- as.matrix((nhood.adj > 0) + 0) -->
<!--     #  -->
<!--     # n.dim <- ncol(nhood.adj) -->
<!--     # if(!isSymmetric(nhood.adj)){ -->
<!--     #     stop("Overlap matrix is not symmetric") -->
<!--     # } -->
<!--     #  -->
<!--     # if(nrow(nhood.adj) != ncol(nhood.adj)){ -->
<!--     #     stop("Non-square distance matrix - check nhood subsetting") -->
<!--     # } -->
<!--     #  -->
<!--     # g <- graph_from_adjacency_matrix(nhood.adj, mode="undirected", diag=FALSE) -->
<!--     # groups <- components(g)$membership -->
<!--     #  -->
<!--     # # only keep the groups that contain >= 1 DA neighbourhoods -->
<!--     # keep.groups <- intersect(unique(groups[is.da]), unique(groups)) -->
<!-- .group_nhoods_from_adjacency <- function(x, da.res, is.da, -->
<!--                                          merge.discord=FALSE, -->
<!--                                          lfc.threshold=NULL, -->
<!--                                          overlap=1, subset.nhoods=NULL){ -->

<!--     if(is.null(names(nhs))){ -->
<!--         warning("No names attributed to nhoods. Converting indices to names") -->
<!--         names(nhs) <- as.character(c(1:length(nhs))) -->
<!--     } -->

<!--   nh_graph <- nhoodGraph(x) -->
<!--   ## Change names from nhoodIndex to nhoodNumber -->
<!--   V(nh_graph)$name <- match(V(nh_graph)$name, unlist(nhoodIndex(x))) -->

<!--   max(as.numeric(V(nh_graph)$name)) -->

<!--   if(!is.null(subset.nhoods)){ -->
<!--     da.res <- da.res[subset.nhoods,]   -->
<!--   } -->

<!--   keep.nhoods <- da.res$SpatialFDR < da.fdr &  -->
<!--     abs(da.res$logFC) > lfc.threshold -->

<!-- # unlist(nhoodIndex(x)[subset.nhoods]) -->
<!-- # as.numeric(V(nh_graph)$name) %in% unlist(nhoodIndex(x)[subset.nhoods]) & -->
<!-- #   (da.res$SpatialFDR < da.fdr) -->

<!--   nh_graph <-  -->
<!--     igraph::induced_subgraph(nh_graph,  -->
<!--                              vids = as.character(da.res$Nhood[keep.nhoods]) -->
<!--                              # vids = which(as.numeric(V(nh_graph)$name) %in% unlist(nhoodIndex(x)[da.res[keep.nhoods,"Nhood"]])) -->
<!--                              ) -->
<!--   groups <- components(nh_graph)$membership -->
<!--   keep.groups <- intersect(unique(groups[is.da]), unique(groups)) -->



<!--   return(groups[groups %in% keep.groups]) -->
<!-- } -->


<!-- da.res %>% -->
<!--   mutate(keep=keep.nhoods) %>% -->
<!--   ggplot(aes(logFC, -log10(SpatialFDR), color=keep)) + -->
<!--   geom_point() -->
<!-- ggraph(nh_graph) + geom_edge_link2() -->
<!-- ``` -->


<!-- ```{r} -->
<!-- lfc.threshold = 0 -->
<!-- if(!is.null(nhoodAdjacency(x)) & isFALSE(compute.new)){ -->
<!--     message("nhoodAdjacency found - using for nhood grouping") -->
<!--     nhs.da.gr <- .group_nhoods_from_adjacency(x, -->
<!--                                               da.res=milo_res, -->
<!--                                               is.da=da.res$SpatialFDR < da.fdr, -->
<!--                                               subset.nhoods = subset.nhoods, -->
<!--                                               merge.discord=merge.discord, -->
<!--                                               overlap=overlap, -->
<!--                                               lfc.threshold=lfc.threshold -->
<!--                                               ) -->
<!-- } else{ -->
<!--     nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x), -->
<!--                                           da.res=da.res, -->
<!--                                           is.da=da.res$SpatialFDR < da.fdr, -->
<!--                                           merge.discord=merge.discord, -->
<!--                                           overlap=overlap, -->
<!--                                           ) # returns a vector group values for each nhood -->
<!-- } -->

<!-- nhood.gr <- unique(nhs.da.gr) -->

<!-- names(nhs.da.gr) -->
<!-- # names(nhs.da.gr) <- match(names(nhs.da.gr), nhoodIndex(x)) -->
<!-- nhs.da.gr.all <- rep(NA, length(nhoods(x))) -->

<!-- nhs.da.gr.all[as.numeric(names(nhs.da.gr))] <- nhs.da.gr -->


<!-- # perform DGE _within_ each group of cells using the input design matrix -->
<!-- message(paste0("Nhoods aggregated into ", length(nhood.gr), " groups")) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- cell2nhoods <- function(x){ -->
<!--   nhs <- lapply(nhoods(x), function(nh) as.vector(nh)) -->
<!--   # nhood_mat <- matrix(nrow = ncol(x), ncol=length(nhoods(x))) -->
<!--   nhood_mat <- sapply(nhs, function(nh) ifelse(1:ncol(x) %in% nh, 1, 0)) -->
<!--   return(nhood_mat) -->
<!-- } -->
<!-- ``` -->


<!-- ```{r} -->

<!-- nh_mat <- cell2nhoods(x) -->
<!-- fake.meta <- data.frame("CellID"=colnames(x)) -->
<!-- # rownames(fake.meta) <- fake.meta$CellID -->

<!-- for(i in seq_along(nhood.gr)){ -->
<!--   gr.cells <- rowSums(nh_mat[,which(nhs.da.gr.all == nhood.gr[i])]) > 0 -->
<!--   fake.meta[,paste0("group_", i)] <- gr.cells -->
<!-- } -->
<!-- # dim(fake.meta[rowSums(nh_mat[,which(nhs.da.gr.all == nhood.gr[i])]) > 0,]) -->
<!-- # # nhs.da.gr.all <- ifelse(1:length(nhoods(x)) %in% match(names(nhs.da.gr), nhoodIndex(x)), nhs.da.gr, NA) -->
<!-- # for(i in seq_along(nhood.gr)){ -->
<!-- #     nhood.x <- nhs.da.gr.all == nhood.gr[i] -->
<!-- #     nhood.x[is.na(nhood.x)] <- FALSE -->
<!-- #     fake.meta[unlist(nhoods(x)[da.res$SpatialFDR < da.fdr][nhood.x]), paste0("group_", i)] <- nhood.gr[i] -->
<!-- # } -->

<!-- fake.meta <- -->
<!--   fake.meta %>% -->
<!--   mutate(Nhood.Group = ifelse(group_1, 1, ifelse(group_2, 2, NA)))  -->
<!-- rownames(fake.meta) <- fake.meta$CellID -->

<!-- colData(endo_milo)[["group_1"]] <- as.factor(fake.meta[colnames(endo_milo),"group_1"]) -->
<!-- colData(endo_milo)[["group_2"]] <- as.factor(fake.meta[colnames(endo_milo),"group_2"]) -->
<!-- colData(endo_milo)[["NhoodGroup"]] <- as.factor(fake.meta[colnames(endo_milo),"NhoodGroup"]) -->
<!-- plotUMAP(endo_milo, colour_by="NhoodGroup") -->

<!-- ``` -->



<!-- ```{r} -->

<!-- # only compare against the other DA neighbourhoods -->
<!-- x <- x[, !is.na(fake.meta$Nhood.Group)] -->
<!-- fake.meta <- fake.meta[!is.na(fake.meta$Nhood.Group), ] -->

<!-- if(!is.null(subset.row)){ -->
<!--     x <- x[subset.row, , drop=FALSE] -->
<!-- } -->

<!--     exprs <- assay(x, assay) -->

<!--     marker.list <- list() -->
<!--     i.contrast <- c("TestTest - TestRef") # always use contrasts for this -->

<!--     # if there is only 1 group, then need to make sure that all neighbourhoods -->
<!--     # are not in this group - otherwise can't do any DGE testing -->
<!--     if(length(nhood.gr) == 1){ -->
<!--         if(sum(fake.meta$Nhood.Group == nhood.gr[1]) == nrow(fake.meta)){ -->
<!--             warning("All graph neighbourhoods are in the same group - cannot perform DGE testing. Returning NULL") -->
<!--             return(NULL) -->
<!--         } -->
<!--     } -->

<!--     for(i in seq_along(nhood.gr)){ -->
<!--         i.meta <- fake.meta -->
<!--         i.meta$Test <- "Ref" -->
<!--         i.meta$Test[fake.meta$Nhood.Group == nhood.gr[i]] <- "Test" -->

<!--         if(ncol(exprs) > 1 & nrow(i.meta) > 1){ -->
<!--             i.design <- as.formula(" ~ 0 + Test") -->
<!--             i.model <- model.matrix(i.design, data=i.meta) -->
<!--             rownames(i.model) <- rownames(i.meta) -->
<!--         } -->

<!--         sink(file="/dev/null") -->
<!--         gc() -->
<!--         sink(file=NULL) -->

<!--         if(assay == "logcounts"){ -->
<!--             i.res <- .perform_lognormal_dge(exprs, i.model, model.contrasts=i.contrast, -->
<!--                                             gene.offset=gene.offset) -->
<!--         } else if(assay == "counts"){ -->
<!--             i.res <- .perform_counts_dge(exprs, i.model, model.contrasts=i.contrast, -->
<!--                                          gene.offset=gene.offset) -->
<!--         } else{ -->
<!--             warning("Assay type is not counts or logcounts - assuming (log)-normal distribution. Use these results at your peril") -->
<!--             i.res <- .perform_lognormal_dge(exprs, i.model, -->
<!--                                             model.contrasts=i.contrast, -->
<!--                                             gene.offset=gene.offset) -->
<!--         } -->

<!--         i.res$adj.P.Val[is.na(i.res$adj.P.Val)] <- 1 -->
<!--         i.res$logFC[is.infinite(i.res$logFC)] <- 0 -->

<!--         i.res <- i.res[, c("logFC", "adj.P.Val")] -->
<!--         colnames(i.res) <- paste(colnames(i.res), nhood.gr[i], sep="_") -->
<!--         marker.list[[paste0(nhood.gr[i])]] <- i.res -->
<!--     } -->

<!--     marker.df <- do.call(cbind.data.frame, marker.list) -->
<!--     colnames(marker.df) <- gsub(colnames(marker.df), pattern="^[0-9]+\\.", replacement="") -->
<!--     marker.df$GeneID <- rownames(i.res) -->

<!--     if(isTRUE(return.groups)){ -->
<!--         out.list <- list("groups"=fake.meta, "dge"=marker.df) -->
<!--         return(out.list) -->
<!--     }else{ -->
<!--         return(marker.df) -->
<!--     } -->
<!-- nhood_markers <- out.list -->
<!-- sum(marker.df$adj.P.Val_1 < 0.001) -->

<!-- bcell_genes <- c('MS4A1', "CD19", "FCGR2B", "IGHA1") -->

<!-- plotUMAP(endo_milo, colour_by=bcell_genes[3]) -->

<!-- ``` -->

#### Visualize as volcano 

```{r, fig.height=6, fig.width=10, message=FALSE, warning=FALSE}
marker.df %>%
   mutate(adj.P.Val_1 = ifelse(- log10(adj.P.Val_1) > 300, 1e-300, adj.P.Val_1)) %>%
  mutate(label=ifelse(-log10(adj.P.Val_1) > 1.7, GeneID, NA)) %>%
  ggplot(aes(logFC_2, -log10(adj.P.Val_1), 
             # color=highlight
             )) + 
  geom_point() +
  ggrepel::geom_text_repel(aes(label=label)) +
  xlab("logFC") + ylab("- log10(Adj. P value)") +
  theme_bw(base_size = 22)
  
  
```

#### Visualize as heatmap 
(gene expression values are scaled between 0 and 1 for each gene)

<!-- ```{r, fig.height=6, fig.width=9, message=FALSE, warning=FALSE} -->
<!-- marker_genes_down <- marker.df %>% -->
<!--   dplyr::filter(adj.P.Val_1 < 0.1 & logFC_2 < -1) %>% -->
<!--   pull(GeneID) -->


<!-- plotNhoodExpressionDA(liver_milo, milo_res, marker_genes_down, cluster_features = TRUE,  -->
<!--                       alpha = 0.1, -->
<!--                       scale_to_1 = TRUE, -->
<!--                       subset.nhoods = milo_res$annotation_lineage=="Cholangiocytes", -->
<!--                       show_rownames = TRUE -->
<!--                       ) + -->
<!--   ylab("DE genes") + -->
<!--    theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) + -->
<!--   plot_layout(heights = c(1,4)) -->
<!-- ``` -->


```{r, fig.height=10, fig.width=9, message=FALSE, warning=FALSE}
marker_genes_up <- marker.df %>%
  dplyr::filter(adj.P.Val_1 < 0.01 & logFC_2 > 2) %>%
  pull(GeneID)


plotNhoodExpressionDA(liver_milo, milo_res, marker_genes_up, cluster_features = TRUE, 
                      alpha = 0.1,
                      scale_to_1 = TRUE,
                      subset.nhoods = milo_res$annotation_lineage=="Cholangiocytes",
                      show_rownames = TRUE
                      ) +
  ylab("DE genes") +
   theme(legend.text = element_text(size=22), legend.title = element_text(size=24)) +
  plot_layout(heights = c(1,10))


```

### GO term analysis

```{r}
marker_genes_up <- marker.df %>%
  dplyr::filter(adj.P.Val_1 < 0.1 & logFC_2 > 0) %>%
  pull(GeneID)

em_up <- enricher(marker_genes_up, TERM2GENE=m_t2g, pAdjustMethod = "fdr", 
                  universe = rownames(liver_milo)
                  )

em_res_up <- em_up@result[em_up@result$qvalue < 0.05,] %>%
  dplyr::select(- c(Description))
```

```{r, fig.height=8, fig.width=15}

em_res_up %>%
  top_n(40, -log10(qvalue)) %>%
  mutate(Term=factor(ID, levels=rev(unique(ID)))) %>%
  ggplot(aes(Term, -log10(qvalue))) +
  geom_point() +
  coord_flip() +
  xlab("GO Biological Function") + ylab("-log10(Adj. p-value)") +
  theme_bw(base_size=14) +
  ggtitle("Markers of healthy")
```

### Full GO enrichment - up in cirrhotic
(`geneID` column indicates which genes belong to gene ontology)
```{r}
em_res_up
```

---

Assemble figure
```{r, fig.height=25, fig.width=19}
fig4_bottom <- ((fig4_bleft + plot_layout()) |
      ((fig4_bright1 + plot_layout(tag_level = 'keep')) / (fig4_bbright + plot_layout())) +
      plot_layout(heights = c(1,1.6))
   ) +
  plot_layout(widths=c(1,1.2))

(fig4_top / fig4_bottom) +
  plot_layout(heights=c(1,1.8))  +
  ggsave("~/milo_output/fig4.pdf", height = 26, width = 22, useDingbats=FALSE)
  # ggsave("~/milo/ms/figures/figs/figure5.pdf", height = 26, width = 22, useDingbats=FALSE)
```

Assemble supplementary figure

```{r, fig.width=9, fig.height=7}
p1 <- plot_grid(pval_hist,
                volcano + ylab("- log10(Spatial FDR)"),
                frac_hist, nrow=1,
                labels = c("A", "B", "C"))
cowplot::plot_grid(p1, go_plot, rel_heights = c(1,1.5), ncol=1, labels = c("", "D")) +
  ggsave("~/milo/ms/supplement/suppl_figs/suppl_fig6.pdf", height = 7, width=8)
  ggsave("~/milo_output/liver_supplementary.png", height = 7, width=8)

```

<!-- Let's check the genes identified as markers for the disease subtypes. Are they significantly differentially expressed between DA neighbourhoods? Yes they are! -->

<!-- ```{r} -->
<!-- disease_endo_markers <- c("ACKR1", 'CD34',"VWA1") -->

<!-- data.frame(markers$negLogFC_2) %>% -->
<!--   filter(FDR < 0.05) %>% -->
<!--   .[disease_endo_markers,] -->

<!-- data.frame(markers$negLogFC_1) %>% -->
<!--   filter(FDR < 0.05) %>% -->
<!--   .[disease_endo_markers,] -->
<!-- ``` -->

<!-- Visualize some of the markers that differentiate DA neighbourhoods, by plotting the percent of cells expressing each gene in a nhood. -->

<!-- ```{r} -->
<!-- ## Define plotting functions -->
<!-- .calculate_nhood_perc_expression <- function(milo, nhoods, gene){ -->
<!--   gene_cnts <- counts(milo)[gene,] -->
<!--   perc_expr <- sapply(nhoods(milo)[nhoods], function(x) sum(gene_cnts[x]>0)/length(x)) -->
<!--   perc_expr <- setNames(perc_expr, nhoods) -->
<!--   return(perc_expr) -->
<!--   } -->

<!-- .plot_nhood_expression <- function(milo, nhoods, features){ -->
<!--   perc_expr_mat <- sapply(features,  -->
<!--                           function(x) .calculate_nhood_perc_expression(milo, nhoods, x)) -->

<!--   pl_df <- data.frame(perc_expr_mat) %>% -->
<!--     rownames_to_column("Nhood") %>% -->
<!--     mutate(Nhood=as.double(Nhood)) %>% -->
<!--     left_join(milo_res) %>% -->
<!--     mutate(logFC_rank=rank(logFC))  -->

<!--   pl_top <- pl_df %>% -->
<!--       mutate(is_signif = ifelse(SpatialFDR < 0.1, "SpatialFDR < 0.1", NA)) %>% -->
<!--       ggplot(aes(logFC_rank, logFC)) + -->
<!--       geom_hline(yintercept = 0, linetype=2) + -->
<!--       geom_point(size=0.2) + -->
<!--       geom_point(data=.%>% filter(!is.na(is_signif)), aes(color=is_signif), size=0.5) + -->
<!--       theme_bw() + -->
<!--       scale_color_manual(values="red", name="") + -->
<!--       theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) -->

<!--   pl_bottom <- pl_df %>% -->
<!--     pivot_longer(cols=features, names_to='feature', values_to="perc_expressed") %>% -->
<!--     mutate(feature=factor(feature, levels=features)) %>% -->
<!--     ggplot(aes(logFC_rank, feature, fill=perc_expressed)) +  -->
<!--     geom_tile() + -->
<!--     scale_fill_viridis_c(option="magma") + -->
<!--     ggbio::theme_clear() -->

<!--   (pl_top / pl_bottom) + plot_layout(heights = c(1,2)) -->
<!-- } -->
<!-- ``` -->


<!-- ```{r, fig.width=10, fig.height=12} -->
<!-- endo_nhoods <- endo_res %>%  pull(Nhood) -->

<!-- ## Select genes and sort by AUC -->
<!-- feats_neg2 <- -->
<!--   data.frame(markers$negLogFC_2) %>%  -->
<!--   top_n(50, - log10(FDR)) %>% -->
<!--   arrange(AUC.posLogFC_1) %>% -->
<!--   rownames() -->

<!-- .plot_nhood_expression(liver_milo, endo_nhoods, features=feats) -->
<!-- ``` -->

<!-- As described in the paper, we have that genes associated with extracellular matrix organization (e.g. VIM, ) are over expressed  -->


<!-- ```{r, fig.width=10, fig.height=7} -->
<!-- ## Select genes and sort by AUC -->
<!-- feats_neg1 <- -->
<!--   data.frame(markers$negLogFC_1) %>%  -->
<!--   top_n(50, - log10(FDR)) %>% -->
<!--   arrange(AUC.posLogFC_1) %>% -->
<!--   rownames() -->

<!-- .plot_nhood_expression(liver_milo, endo_nhoods, features=feats) -->

<!-- ``` -->

<!-- ```{r, fig.width=10, fig.height=7} -->
<!-- feats_neg1vsNeg2 <- -->
<!--   data.frame(markers$negLogFC_1) %>%  -->
<!--   top_n(50, - log10(FDR)) %>% -->
<!--   arrange(AUC.negLogFC_2) %>% -->
<!--   rownames() -->

<!-- .plot_nhood_expression(liver_milo, endo_nhoods, features=feats_neg1vsNeg2) -->
<!-- ``` -->


<!-- Look just at Endothelia (5) where you have both positive and negative fold-changes -->

<!-- ```{r} -->
<!-- endo5_nhoods <- milo_res %>%  -->
<!--   filter(annotation_indepth=="Endothelia (5)" & annotation_indepth_fraction > 0.7) %>% -->
<!--   pull(Nhood) -->

<!-- perc_expr_mat <- sapply(features,  -->
<!--                         function(x) .calculate_nhood_perc_expression(liver_milo, endo5_nhoods, x)) -->

<!-- pl_df <- data.frame(perc_expr_mat) %>% -->
<!--   rownames_to_column("Nhood") %>% -->
<!--   mutate(Nhood=as.double(Nhood)) %>% -->
<!--   left_join(milo_res) %>% -->
<!--   mutate(logFC_rank=rank(logFC))  -->

<!-- pl_top <- pl_df %>% -->
<!--     mutate(is_signif = ifelse(SpatialFDR < 0.1, "SpatialFDR < 0.1", NA)) %>% -->
<!--     ggplot(aes(logFC_rank, logFC)) + -->
<!--     geom_hline(yintercept = 0, linetype=2) + -->
<!--     geom_point(size=0.2) + -->
<!--     geom_point(data=.%>% filter(!is.na(is_signif)), aes(color=is_signif), size=0.5) + -->
<!--     theme_bw() + -->
<!--     scale_color_manual(values="red", name="") + -->
<!--     theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) -->

<!-- pl_bottom <- pl_df %>% -->
<!--   pivot_longer(cols=features, names_to='feature', values_to="perc_expressed") %>% -->
<!--   ggplot(aes(logFC_rank, feature, fill=perc_expressed)) +  -->
<!--   geom_tile() + -->
<!--   scale_fill_viridis_c(option="magma") + -->
<!--   theme_clear() -->

<!-- (pl_top / pl_bottom) + plot_layout(heights = c(1,2)) -->
<!-- ``` -->

<!-- How to find association de novo in Endo 5? -->

<!-- ```{r, fig.height=8, fig.width=8} -->
<!-- ## Find most highly variable genes in this cluster -->
<!-- endo5_milo <- liver_milo[,which(colData(liver_milo)[["annotation_indepth"]]=="Endothelia (5)")] -->
<!-- dec_endo5 <- modelGeneVar(endo5_milo) -->
<!-- endo5_hvgs <- getTopHVGs(dec_endo5, n=1000) -->

<!-- perc_expr_mat <- sapply(endo5_hvgs, function(x) .calculate_nhood_perc_expression(liver_milo, endo5_nhoods, x)) -->

<!-- milo_res_endo5 <- milo_res[which(milo_res$Nhood %in% endo5_nhoods),] -->

<!-- fc_cor <- apply(perc_expr_mat, 2, function(x) Hmisc::rcorr(x, milo_res_endo5$logFC)$r[1,2]) -->
<!-- fc_cor_pval <- apply(perc_expr_mat, 2, function(x) Hmisc::rcorr(x, milo_res_endo5$logFC)$P[1,2]) -->

<!-- cor_feats <- names(which(abs(fc_cor) > 0.6 & abs(fc_cor_pval) < 0.05)) -->
<!-- cor_feats_ordered <- cor_feats[order(fc_cor[cor_feats])] -->

<!-- pl_df <- data.frame(perc_expr_mat[,cor_feats]) %>% -->
<!--   rownames_to_column("Nhood") %>% -->
<!--   mutate(Nhood=as.double(Nhood)) %>% -->
<!--   left_join(milo_res) %>% -->
<!--   mutate(logFC_rank=rank(logFC))  -->

<!-- pl_top <- pl_df %>% -->
<!--     mutate(is_signif = ifelse(SpatialFDR < 0.1, "SpatialFDR < 0.1", NA)) %>% -->
<!--     ggplot(aes(logFC_rank, logFC)) + -->
<!--     geom_hline(yintercept = 0, linetype=2) + -->
<!--     geom_point(size=0.2) + -->
<!--     geom_point(data=.%>% filter(!is.na(is_signif)), aes(color=is_signif), size=0.5) + -->
<!--     theme_bw() + -->
<!--     scale_color_manual(values="red", name="") + -->
<!--     theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) -->

<!-- pl_bottom <- pl_df %>% -->
<!--   pivot_longer(cols=str_replace(cor_feats_ordered, "-", "."), names_to='feature', values_to="perc_expressed") %>% -->
<!--   mutate(feature=factor(feature, levels=str_replace(cor_feats_ordered, "-", "."))) %>% -->
<!--   ggplot(aes(logFC_rank, feature, fill=perc_expressed)) +  -->
<!--   geom_tile() + -->
<!--   scale_fill_viridis_c(option="magma") + -->
<!--   theme_clear() -->

<!-- (pl_top / pl_bottom) + plot_layout(heights = c(1,2)) -->
<!-- ``` -->



<!-- ```{r} -->
<!-- ## Run common PCA -->
<!-- merged_cnts <- cbind(nhoodExpression(endo_milo), logcounts(endo_milo)[hvgs,]) -->
<!-- merged_cnts_scaled <- t(scale(t(merged_cnts))) -->
<!-- merged_pca <- BiocSingular::runPCA(t(merged_cnts_scaled), rank=30, center=FALSE) -->
<!-- pca_mat <- rbind(merged_pca$x[(ncol(endo_milo)+1):(ncol(endo_milo)+(length(nhoods(endo_milo)))),], merged_pca$x[colnames(endo_milo),]) -->
<!-- ## Add to slot nhoodsReducedDim -->
<!-- nhoodReducedDim(endo_milo, "PCA") <- pca_mat -->

<!-- ## Run UMAP on joint PCA -->
<!-- umap_out <- uwot::umap(nhoodReducedDim(endo_milo, "PCA"), n_neighbors = 20, n_components = 2, scale=FALSE) -->
<!-- colnames(umap_out) <- c("UMAP_1", "UMAP_2") -->
<!-- nhoodReducedDim(endo_milo, "UMAP") <- umap_out -->

<!-- ``` -->

<!-- ```{r} -->
<!-- split_by=NULL -->
<!-- ## Join test results and dimensionality reductions -->
<!-- rdim_df <- data.frame(nhoodReducedDim(endo_milo, "UMAP")[,c(1,2)]) -->
<!-- colnames(rdim_df) <- c('X','Y') -->

<!-- n_nhoods <- length(nhoods(endo_milo)) -->
<!-- rdim_df[,"Nhood"] <- ifelse(1:nrow(rdim_df) %in% c(1:n_nhoods), c(1:n_nhoods), NA) -->
<!-- viz_df  <- left_join(rdim_df, milo_res[which(milo_res$annotation_lineage=="Endothelia"),], by="Nhood") -->
<!-- viz_df[["nhIndex"]] <- unlist(ifelse(!is.na(viz_df$Nhood), nhoodIndex(endo_milo)[viz_df$Nhood],NA)) -->
<!-- viz_df[is.na(viz_df["nhIndex"]),'nhIndex'] <- 1:ncol(endo_milo) # Add index also to single-cells -->

<!-- if (!is.null(split_by)){ -->
<!--   split_df <- data.frame(split_by=colData(endo_milo)[,split_by]) -->
<!--   split_df[,"nhIndex"] <- 1:nrow(split_df) -->
<!--   viz_df  <- left_join(viz_df, split_df, by="nhIndex") -->
<!-- } -->

<!-- filter_alpha=0.1 -->
<!-- ## Filter significant DA nhoods -->
<!-- if (!is.null(filter_alpha)) { -->
<!--   if (filter_alpha > 0) { -->
<!--     viz_df <- mutate(viz_df, logFC = ifelse(SpatialFDR > filter_alpha, NA, logFC)) -->
<!--   } -->
<!-- } -->

<!-- ## Plot -->
<!-- pt_size=1 -->
<!-- nhood_reduced_dims="UMAP" -->
<!-- components=c(1,2) -->
<!--   pl <- -->
<!--     ggplot(data = arrange(viz_df, abs(logFC)), -->
<!--            aes(X, Y)) + -->
<!--     geom_point(aes(color = ''), size = pt_size / 3, alpha = 0.5) + -->
<!--     geom_point( -->
<!--       data = . %>% filter(!is.na(SpatialFDR)), -->
<!--       aes(fill = logFC), -->
<!--       size = pt_size, -->
<!--       stroke = 0.1, -->
<!--       # colour="black", -->
<!--       shape = 21 -->
<!--     ) + -->
<!--     scale_fill_gradient2( -->
<!--       midpoint = 0, -->
<!--       high = "red", -->
<!--       low = "blue", -->
<!--       name = "log-FC" -->
<!--     ) + -->
<!--     xlab(paste(nhood_reduced_dims, components[1], sep="_")) + -->
<!--     ylab(paste(nhood_reduced_dims, components[2], sep="_")) -->

<!--   if (!is.null(split_by)) { -->
<!--     pl <- pl + facet_wrap(split_by~.) -->
<!--   } -->
<!--   if (!is.null(filter_alpha)) { -->
<!--     pl <- pl + -->
<!--       scale_color_manual(values = 'grey', label = paste("SpatialFDR >", round(filter_alpha, 2))) + -->
<!--       guides(colour = guide_legend( -->
<!--         '', -->
<!--         override.aes = list( -->
<!--           shape = 21, -->
<!--           colour = "black", -->
<!--           fill = "grey50", -->
<!--           size = pt_size, -->
<!--           alpha = 1, -->
<!--           stroke = 0.1 -->
<!--         ) -->
<!--       )) -->
<!--   } else { -->
<!--     pl <- pl + -->
<!--       scale_color_manual(values = 'grey') + -->
<!--       guides(color="none") -->
<!--   } -->

<!--   pl <- pl + -->
<!--     theme_classic(base_size = 16) + -->
<!--     theme( -->
<!--       axis.ticks = element_blank(), -->
<!--       axis.text = element_blank(), -->
<!--       plot.title = element_text(hjust = 0.5) -->
<!--     ) -->
<!-- pl -->
<!-- ``` -->

<!-- ```{r} -->
<!-- endo_milo <- scater::runPCA(endo_milo, subset_row=hvgs) -->
<!-- ``` -->



<!-- --- -->
<!-- ## Old (before I got dataset from authors) -->

<!-- Using data from [Ramachandran et al. 2019](https://www.nature.com/articles/s41586-019-1631-3#Sec1) (GEO accessiion: GSE136103).  -->

<!-- ```{r} -->
<!-- human_files <- list.files("~/Downloads/GSE136103_RAW", pattern="GSM40411.._healthy|cirrhotic", full.names = TRUE)  -->

<!-- prefixes <- str_remove(human_files, "barcodes.tsv.gz|genes.tsv.gz|matrix.mtx.gz") %>% -->
<!--   # str_remove(".+/") %>% -->
<!--   unique()  -->

<!-- sce_ls <- lapply(prefixes, function(x) read10xCounts(x, type="prefix")) -->
<!-- liver_sce <- purrr::reduce(sce_ls, cbind) -->

<!-- ## Make colData info -->
<!-- new_coldata <- colData(liver_sce) %>% -->
<!--   data.frame() %>% -->
<!--   mutate(Sample=str_remove(Sample, ".+/") %>% str_remove("_$")) %>% -->
<!--   separate(Sample, into=c("col1", "Patient", "Sort"), sep = "_", remove=FALSE) %>%  -->
<!--   mutate(Condition=str_remove(Patient, ".$")) %>% -->
<!--   select(-col1) %>% -->
<!--   mutate(Cell_id = str_c(Sample, "_",Barcode)) %>% -->
<!--   column_to_rownames("Cell_id") -->

<!-- colnames(liver_sce) <- rownames(new_coldata) -->
<!-- colData(liver_sce) <- DataFrame(new_coldata) -->

<!-- # saveRDS(liver_sce, "~/GSE136103_SingleCellExperiment.RDS") -->
<!-- ``` -->
<!-- ```{r} -->
<!-- liver_sce <- readRDS("~/GSE136103_SingleCellExperiment.RDS") -->
<!-- ``` -->

<!-- QC metrics show that the outlier cells are already filtered (following [this](https://osca.bioconductor.org/overview.html#data-processing-and-downstream-analysis)) -->

<!-- ```{r, fig.width=10,fig.height=8} -->
<!-- # is.mito <- grepl("^MT-", rownames(liver_sce)) -->
<!-- qcstats <- perCellQCMetrics(liver_sce) -->

<!-- colData(liver_sce) <- cbind(colData(liver_sce), qcstats) -->

<!-- plotColData(liver_sce, x="Sample", y = "total", other_fields = "Condition") + -->
<!--   scale_y_log10() + -->
<!--   facet_wrap(~Condition, scales = "free") + -->
<!--   ggtitle("total counts")  -->

<!-- plotColData(liver_sce, x="Sample", y = "detected", other_fields = "Condition") + -->
<!--   facet_wrap(~Condition, scales = "free") + -->
<!--   ggtitle("Detected genes")  -->

<!-- ``` -->

<!-- ### Normalization -->

<!-- ```{r} -->
<!-- lib_sf <- librarySizeFactors(liver_sce) -->
<!-- hist(log10(lib_sf), xlab="Log10[Size factor]", col='grey80') -->
<!-- ``` -->
<!-- ```{r} -->
<!-- set.seed(100) -->
<!-- liver_sce <- logNormCounts(liver_sce) -->
<!-- assayNames(liver_sce) -->
<!-- ``` -->

<!-- ### Feature selection -->

<!-- ```{r} -->
<!-- library(scran) -->
<!-- dec_liver <- modelGeneVar(liver_sce) -->

<!-- # Visualizing the fit: -->
<!-- fit_liver <- metadata(dec_liver) -->
<!-- plot(fit_liver$mean, fit_liver$var, xlab="Mean of log-expression", -->
<!--     ylab="Variance of log-expression") -->

<!-- hvgs <- getTopHVGs(dec_liver, n=3000) -->
<!-- ``` -->



<!-- ### Dim reduction -->

<!-- ```{r, fig.height=14, fig.width=14} -->
<!-- liver_sce <- scater::runPCA(liver_sce, subset_row=hvgs, ncomponents=30) -->
<!-- reducedDim(liver_sce, "PCA") <- reducedDim(liver_sce, "PCA")[,1:11] -->
<!-- plotPCA(liver_sce, colour_by="Condition", ncomponents=3) -->
<!-- ``` -->

<!-- ```{r, fig.height=8, fig.width=8} -->
<!-- liver_sce <- runUMAP(liver_sce, dimred="PCA", ncomponents=2) -->

<!-- scater::plotUMAP(liver_sce, colour_by="Condition", point_alpha=1,  point_size=0.8)  -->
<!-- scater::plotUMAP(liver_sce, colour_by="Sort", point_alpha=0.3,  point_size=0.5) -->
<!-- scater::plotUMAP(liver_sce, colour_by="Patient", point_alpha=0.3,  point_size=0.5) -->
<!-- ``` -->
<!-- ```{r, fig.height=10, fig.width=10} -->
<!-- rownames(liver_sce) <- rowData(liver_sce)$Symbol -->
<!-- scater::plotUMAP(liver_sce, colour_by="CD3D", point_alpha=0.3, point_size=0.5) -->
<!-- ``` -->



